import pandas as pd

% matplotlib inline

df_census = pd.read_csv(‘census_income_data.csv’)

df_census.info()

df_census.hist(figsize=8, 8));

df_census[‘age’].hist()

df_census[‘age’].plot(kind=’hist’);

df_census[‘education’].value_counts() #aggregates counts for each unique value in a column

df_census[‘education’].value_counts().plot(kind=’bar’)

df_census[‘education’].value_counts().plot(kind=’pie’, figsize=(8, 8));

 

# Load the cancer data set.
df_cancer = pd.read_csv('cancer_data_edited.csv')

# Pairwise scatter plots between the columns of the frame.
pd.plotting.scatter_matrix(df_cancer, figsize=(15, 15));

# Relationship between two individual columns.
df_cancer.plot(x='compactness', y='concavity', kind='scatter');

# Box plot of a single column.
df_cancer['concave_points'].plot(kind='box');

 

import pandas as pd

df = pd.read_csv('cancer_data_edited.csv')

df.head()

# Select the rows whose diagnosis is 'M' with an inline boolean condition.
df_m = df[df['diagnosis'] == 'M']

df_m.head()

# Same selection, with the boolean mask stored in a variable first.
mask = df['diagnosis'] == 'M'

df_m = df[mask]

df_m

# Summary statistics of the area column for the 'M' rows.
df_m['area'].describe()

# Repeat the selection and summary for the rows whose diagnosis is 'B'.
df_b = df[df['diagnosis'] == 'B']

df_b['area'].describe()

 

import matplotlib.pyplot as plt

% matplotlib inline

fig, ax = plt.subplots(figsize=(8,6))
ax.hist(df_b[‘area’], alpha = 0.5, label=’benign’)
ax.hist(df_m[‘area’], alpha = 0.5, label=’malignant’)
ax.set_title(‘Distribution of Benign and Malignant Tumor Areas’)
ax.set_xlabel(‘Area’)
ax.set_ylabel(‘Count’)
ax.legend(loc=’upper right’)
plt.show

 

## Exploring Data with Visuals Quiz

# imports and load data
import pandas as pd
% matplotlib inline
ppdata = pd.read_csv(‘powerplant_data_edited.csv’)

ppdata.info()

# plot relationship between temperature and electrical output
ppdata.plot(x=’Temperature’, y=’Net hourly electrical energy output’, kind=’scatter’);

# plot distribution of humidity
ppdata[‘Relative Humidity’].plot(kind=’hist’);

# plot box plots for each variable
ppdata.plot(kind=’box’);

 

## EDA with Visuals

# Create visualizations to answer the quiz questions below this notebook.

# NOTE(review): `wine_df` is not defined anywhere in the visible notebook, so
# these cells fail with NameError on a fresh kernel. Assuming it is the wine
# quality data set loaded later from 'winequality_edited.csv' — TODO confirm
# the file and that it has 'fixed_acidity' and 'quality' columns.
wine_df = pd.read_csv('winequality_edited.csv')

# Distribution of fixed acidity.
wine_df['fixed_acidity'].plot(kind='hist')

# Relationship between fixed acidity and quality.
wine_df.plot(x='fixed_acidity', y='quality', kind='scatter')

## Drawing Conclusions Using Groupby

# Load `winequality_edited.csv`
import pandas as pd
df = pd.read_csv('winequality_edited.csv')

# Question: Is a certain type of wine associated with higher quality?

# Find the mean quality of each wine type (red and white) with groupby.
# numeric_only=True keeps this working on pandas >= 2.0, where averaging a
# non-numeric column raises instead of being silently dropped.
df.groupby('color').mean(numeric_only=True)