Como Fazer uma EDA (competição PETS-Kaggle 🙀x🐶)
Tutorial de EDA para a competição Kaggle
- Setup
- Baixando o dataset da competição
- Explorando o CSV
- Explorando as fotos
- Fotos por Pawpularity
- Baseline Simples
- Baseline mais simples ainda
- Influência de gato/cachorro
# Install (or upgrade, -U) the Kaggle API client and fastai with pip output silenced (-qqq).
!pip install -Uqqq kaggle
!pip install -Uqqq fastai
# Colab-only helper: prompts for a file upload; the kaggle.json API token is expected here.
from google.colab import files
uploaded = files.upload()
# Display the uploaded file names as a quick sanity check.
uploaded.keys()
# Move the token into ~/.kaggle/ and make it readable/writable by the owner only.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
# Create the Kaggle API client and authenticate
# (presumably using the token placed in ~/.kaggle/ above).
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
api.competition_download_files('petfinder-pawpularity-score')
!unzip petfinder-pawpularity-score.zip -d comp_data > /dev/null 2>&1
train_df = pd.read_csv('/content/comp_data/train.csv')
train_df.head()
train_df['Pawpularity'].nunique()
train_df['Pawpularity'].value_counts()
# Histogram of the target score with a KDE curve overlaid.
# Set the default figure size before the figure is created so it takes effect.
sns.set(rc={'figure.figsize':(14,7)})
fig = plt.figure()
# histplot draws on the current axes of the figure created above.
sns.histplot(data=train_df, x='Pawpularity', kde=True)
plt.title('Histograma do score "Pawpularity"', fontsize=20, fontweight='bold')
plt.show()
# One count plot per metadata column, laid out on a 3x4 grid.
sns.set(rc={'figure.figsize': (21, 15)})
# Drop the first and last columns (presumably `Id` and the `Pawpularity`
# target -- confirm against train.csv); the rest are the features to plot.
predictor = train_df.columns[1:-1]
fig = plt.figure()
for i, p in enumerate(predictor):
    # Subplot positions are 1-based, hence i+1.
    ax = fig.add_subplot(3, 4, i+1)
    sns.countplot(x=p, data=train_df, ax=ax)
    # Hide the x-axis label: the column name is shown as a title below the plot.
    ax.set_xlabel(None)
    ax.set_title(p, y=-0.1, color="#e7273e", fontweight='bold')
plt.show()
# Correlation heatmap for every column except the first one.
sns.set(rc={'figure.figsize':(12,10)})
predictor = train_df.columns[1:]
# Pairwise correlation between the selected columns (pandas default method).
corr_matrix = train_df[predictor].corr()
fig = plt.figure()
sns.set_theme(style="white")
# Boolean mask that is True on the diagonal and above it: those cells are
# hidden, leaving only the lower triangle of the symmetric matrix visible.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
# Diverging colour map so positive and negative correlations get different hues.
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# fmt='.1g' prints each annotation with a single significant digit.
sns.heatmap(corr_matrix, annot=True, fmt='.1g', cmap=cmap,
mask=mask, square=True)
plt.title('Matriz de Correlação', fontsize=20, fontweight='bold')
plt.show()
# Directory holding the training photos; each file is named "<Id>.jpg".
train_path = Path('/content/comp_data/train')
# Store the full path of every photo next to its metadata row.
train_df['path'] = [train_path/(img_id + '.jpg') for img_id in train_df['Id']]
# Show the first photo as a sanity check (the trailing ';' hides the cell's repr output).
plt.imshow(plt.imread(train_df.loc[0, 'path']));
def plot_pawpularity(score, n):
    """Show up to ``n`` randomly chosen training photos that have a given score.

    Reads the module-level ``train_df`` (columns ``Pawpularity`` and ``path``)
    and draws the photos side by side on the current matplotlib figure.

    Args:
        score: Pawpularity value to filter on.
        n: Number of columns in the photo row and maximum number of photos.
    """
    matches = train_df.query(f'Pawpularity == {score}')
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request; rare scores then simply show fewer photos.
    imgs = matches.sample(min(n, len(matches)))['path']
    for i, img in enumerate(imgs):
        img = plt.imread(img)
        # Keep an n-column grid so photo sizes stay comparable between calls.
        plt.subplot(1, n, i+1)
        plt.title(score)
        plt.axis('off')
        plt.imshow(img)
# Look at four random photos for each of three different scores (10, 30 and 90).
for score in (10, 30, 90):
    plot_pawpularity(score, 4)
def RMSE(pred, targ):
    """Return the root-mean-square error between predictions and targets.

    Args:
        pred: Predicted value(s): a scalar or any array-like (list, NumPy
            array, pandas Series).
        targ: Target value(s), broadcast-compatible with ``pred``.

    Returns:
        The RMSE as a NumPy float.
    """
    # Cast to float arrays so plain Python lists are accepted and small
    # integer dtypes (e.g. uint8) cannot wrap around when subtracted or
    # squared. Values are compared positionally.
    pred = np.asarray(pred, dtype=float)
    targ = np.asarray(targ, dtype=float)
    return (((pred - targ)**2).mean())**0.5
# Naive baselines: how well can we score without looking at the photos at all?
sns.set(rc={'figure.figsize':(8,5)})
train_data = pd.read_csv('/content/comp_data/train.csv')
n_samp = len(train_data)
np.random.seed(42)
# Shuffle the row positions WITHOUT replacement. Drawing indices with
# np.random.randint samples with replacement, which puts the same rows in
# both train and test (leakage) and leaves other rows unused.
split = np.random.permutation(n_samp)
split
# 80% of the shuffled rows for training, the remaining 20% for testing.
train = train_data['Pawpularity'].values[split[:int(0.8*n_samp)]]
test = train_data['Pawpularity'].values[split[int(0.8*n_samp):]]
# Baseline 1: predict scores drawn at random from the training distribution.
# Repeat many times to see the spread of the resulting RMSE.
rmse = []
for n in range(10000):
    preds = np.random.choice(train, len(test), replace=False)
    rmse.append(RMSE(preds, test))
rmse = np.array(rmse)
plt.hist(rmse, bins = 30);
# Compare the train, last random-prediction and test distributions.
plt.hist(train)
plt.hist(preds)
plt.hist(test);
# Baseline 2: always predict the training mean.
preds = train.mean()
RMSE(preds, test)
# Baseline 3: always predict the training median.
preds = np.median(train)
RMSE(preds, test)
# Baseline 4: always predict the most frequent training score.
from scipy.stats import mode
preds = mode(train)[0].item()
RMSE(preds, test)
# fastai's vision API is brought in with a star import; DataBlock, untar_data,
# URLs, Resize, etc. used below come from here.
from fastai.vision.all import *
# Fetch the dataset referenced by URLs.PETS and point at its 'images' sub-folder.
path = untar_data(URLs.PETS)/'images'
def labeler(x):
    """Label an image file as cat ('Gato') or dog ('Cachorro').

    The decision depends only on the first character of the file name:
    upper-case means cat, anything else means dog.

    Args:
        x: Path-like object exposing a ``name`` attribute.

    Returns:
        'Gato' or 'Cachorro'.
    """
    first_char = x.name[0]
    if first_char.isupper():
        return 'Gato'
    return 'Cachorro'
# Cat-vs-dog classifier trained on the image files found under `path`.
dblock = DataBlock(
    blocks = [ImageBlock, CategoryBlock],
    # Hold out 20% of the files for validation; fixed seed for reproducibility.
    splitter = RandomSplitter(valid_pct = 0.2, seed = 42),
    get_items = get_image_files,
    # The label is derived from the file name by `labeler`.
    get_y = labeler,
    item_tfms = Resize(256),
)
dls = dblock.dataloaders(path)
# `vision_learner` is the current name of this factory; `cnn_learner` is a
# deprecated alias in recent fastai releases (the notebook installs the latest).
learn = vision_learner(dls, resnet50, metrics=accuracy)
learn.fine_tune(5)
# Build the inference loader from the learner's own DataLoaders so the
# competition photos go through exactly the same item and batch transforms
# that were used for training. A separately built DataBlock only resizes the
# images and skips the normalization that fastai's vision learner adds to its
# training DataLoaders by default, so the model would see inputs on a
# different scale than it was trained on.
# NOTE: this rebinds `dls` from the training DataLoaders to the test loader.
dls = learn.dls.test_dl(train_df['path'].tolist())
dls.show_batch()
preds, _ = learn.get_preds(dl = dls)
# Map each row's most likely class index back to its label through the
# learner's vocab instead of assuming which column belongs to which class.
train_df['tipo'] = [learn.dls.vocab[i] for i in preds.argmax(dim=1).tolist()]
train_df['tipo'].value_counts()
# Compare the score distribution of predicted cats vs predicted dogs.
sns.set(rc={'figure.figsize':(14,7)})
fig, ax = plt.subplots(1,2)
# Left: box plot per type; right: overlaid histograms with KDE curves.
sns.boxplot(data=train_df, x='tipo', y='Pawpularity', ax=ax[0])
sns.histplot(train_df, x="Pawpularity", hue="tipo", kde=True, ax=ax[1])
# Use plt.show() like the other plotting cells; Figure.show() only emits a
# warning on non-GUI (e.g. notebook inline) backends.
plt.show()
# Mean score per predicted type.
train_df.groupby('tipo')['Pawpularity'].mean()
# Baseline: predict the training-set mean score of the photo's predicted type.
# `split` holds row POSITIONS, so index with .iloc; .loc would only give the
# same rows while train_df keeps its default 0..n-1 index.
train = train_df.iloc[split[:int(0.8*n_samp)]]
test = train_df.iloc[split[int(0.8*n_samp):]]
# Select the group means by label so the unpacking order is explicit and a
# missing class fails loudly with a KeyError.
m_cachorro, m_gato = train.groupby('tipo')['Pawpularity'].mean().loc[['Cachorro', 'Gato']].values
preds = test['tipo'].apply(lambda x: m_cachorro if x == 'Cachorro' else m_gato).values
# Arguments follow RMSE's (pred, targ) order; plain arrays keep the
# comparison positional.
RMSE(preds, test['Pawpularity'].values)