import pandas as pd
% matplotlib inline
df_census = pd.read_csv(‘census_income_data.csv’)
df_census.info()
df_census.hist(figsize=8, 8));
df_census[‘age’].hist()
df_census[‘age’].plot(kind=’hist’);
df_census[‘education’].value_counts() #aggregates counts for each unique value in a column
df_census[‘education’].value_counts().plot(kind=’bar’)
df_census[‘education’].value_counts().plot(kind=’pie’, figsize=(8, 8));
# Load the cancer data set used for the relationship plots below.
df_cancer = pd.read_csv('cancer_data_edited.csv')

# Pairwise scatter plots of the columns; the large figure keeps each panel legible.
pd.plotting.scatter_matrix(df_cancer, figsize=(15, 15));

# Zoom in on a single pair of columns.
df_cancer.plot(x='compactness', y='concavity', kind='scatter');

# Box plot of one column.
df_cancer['concave_points'].plot(kind='box');
import pandas as pd

df = pd.read_csv('cancer_data_edited.csv')
df.head()

# Keep only the rows whose diagnosis is 'M' (plotted as "malignant" further down).
df_m = df[df['diagnosis'] == 'M']
df_m.head()

# Same selection, written with an explicit boolean mask.
mask = df['diagnosis'] == 'M'
df_m = df[mask]
df_m

# Summary statistics of the area column for the 'M' rows.
df_m['area'].describe()

# Rows whose diagnosis is 'B' (plotted as "benign" further down), with the same summary.
df_b = df[df['diagnosis'] == 'B']
df_b['area'].describe()
import matplotlib.pyplot as plt
% matplotlib inline
fig, ax = plt.subplots(figsize=(8,6))
ax.hist(df_b[‘area’], alpha = 0.5, label=’benign’)
ax.hist(df_m[‘area’], alpha = 0.5, label=’malignant’)
ax.set_title(‘Distribution of Benign and Malignant Tumor Areas’)
ax.set_xlabel(‘Area’)
ax.set_ylabel(‘Count’)
ax.legend(loc=’upper right’)
plt.show
# Exploring Data with Visuals Quiz
# imports and load data
import pandas as pd
% matplotlib inline
ppdata = pd.read_csv(‘powerplant_data_edited.csv’)
ppdata.info()
# plot relationship between temperature and electrical output
ppdata.plot(x=’Temperature’, y=’Net hourly electrical energy output’, kind=’scatter’);
# plot distribution of humidity
ppdata[‘Relative Humidity’].plot(kind=’hist’);
# plot box plots for each variable
ppdata.plot(kind=’box’);
# EDA with Visuals
Create visualizations to answer the quiz questions below this notebook.
import pandas as pd

# NOTE(review): wine_df was never defined in the visible cells, so these plots
# raised NameError on a fresh kernel. Assumes the data is the same
# winequality_edited.csv that this notebook reads further down — TODO confirm.
wine_df = pd.read_csv('winequality_edited.csv')

# Distribution of fixed acidity.
wine_df['fixed_acidity'].plot(kind='hist')

# Relationship between fixed acidity and quality.
wine_df.plot(x='fixed_acidity', y='quality', kind='scatter')
# Drawing Conclusions Using Groupby
# Load `winequality_edited.csv`
import pandas as pd

df = pd.read_csv('winequality_edited.csv')
Is a certain type of wine associated with higher quality?
# Find the mean quality of each wine type (red and white) with groupby
# The result has one row per color and one column per averaged feature; read the
# `quality` column to answer the question. numeric_only=True keeps the call from
# raising on pandas >= 2.0 if the frame holds any non-numeric column.
df.groupby('color').mean(numeric_only=True)