Open In Colab

Setup

# Install/upgrade the Kaggle API client and fastai in the runtime
# (-U upgrades to the newest release, -qqq silences pip's output).
!pip install -Uqqq kaggle
!pip install -Uqqq fastai
     |████████████████████████████████| 186 kB 16.1 MB/s 
     |████████████████████████████████| 56 kB 4.0 MB/s 
from google.colab import files

# Open the Colab upload widget and list the names of the uploaded files.
uploaded = files.upload()
uploaded.keys()

# Move the uploaded kaggle.json into ~/.kaggle/ and restrict it to
# owner read/write only (chmod 600).
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving kaggle.json to kaggle.json
from kaggle.api.kaggle_api_extended import KaggleApi

# Create the Kaggle API client and authenticate it.
# NOTE(review): presumably this reads the kaggle.json placed in ~/.kaggle/ above — confirm.
api = KaggleApi()
api.authenticate()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

Baixando o dataset da competição

# Download the competition archive, then extract it into ./comp_data.
# unzip's stdout/stderr are discarded to keep the cell output clean.
api.competition_download_files('petfinder-pawpularity-score')
!unzip petfinder-pawpularity-score.zip -d comp_data > /dev/null 2>&1

Explorando o CSV

# Load the competition's training table and preview its first rows.
train_csv = '/content/comp_data/train.csv'
train_df = pd.read_csv(train_csv)
train_df.head()
Id Subject Focus Eyes Face Near Action Accessory Group Collage Human Occlusion Info Blur Pawpularity
0 0007de18844b0dbbb5e1f607da0606e0 0 1 1 1 0 0 1 0 0 0 0 0 63
1 0009c66b9439883ba2750fb825e1d7db 0 1 1 0 0 0 0 0 0 0 0 0 42
2 0013fd999caf9a3efe1352ca1b0d937e 0 1 1 1 0 0 0 0 1 1 0 0 28
3 0018df346ac9c1d8413cfcc888ca8246 0 1 1 1 0 0 0 0 0 0 0 0 15
4 001dc955e10590d3ca4673f034feeef2 0 0 0 1 0 0 1 0 0 0 0 0 72
# Number of distinct values taken by the Pawpularity target.
train_df['Pawpularity'].nunique()
100
# Frequency of each Pawpularity value, most common first.
train_df['Pawpularity'].value_counts()
28    318
30    318
26    316
31    312
29    304
     ... 
98     10
97      8
90      7
1       4
99      4
Name: Pawpularity, Length: 100, dtype: int64
# Histogram of the Pawpularity target with a KDE overlay.
# sns.set(...) changes the global default figure size, so it runs before the
# figure is created.
sns.set(rc={'figure.figsize':(14,7)})
fig = plt.figure()
sns.histplot(data=train_df, x='Pawpularity', kde=True)
plt.title('Histograma do score "Pawpularity"', fontsize=20, fontweight='bold')
plt.show()
sns.set(rc={'figure.figsize':(21,15)})
# Select the metadata features by dtype rather than by position. The previous
# positional slice (columns[1:-1]) silently changes once extra columns such as
# 'path' are appended to train_df, which overflows the 3x4 grid when this cell
# is re-run. 'Id' and any non-numeric column are excluded by the dtype filter;
# the target is dropped explicitly.
predictor = train_df.select_dtypes(include='number').columns.drop('Pawpularity')

fig = plt.figure()
# One count plot per feature, laid out on a 3x4 grid.
for i, p in enumerate(predictor):
    ax = plt.subplot(3,4,i+1)
    sns.countplot(data=train_df, x=p, ax=ax)
    ax.set_xlabel(None)
    # The feature name is drawn below the axes (y=-0.1) in place of the x label.
    ax.set_title(p, fontweight='bold', color="#e7273e", y=-0.1)
plt.show()
sns.set(rc={'figure.figsize':(12,10)})
# Correlate the numeric columns only (metadata features + Pawpularity).
# Selecting by dtype instead of by position (columns[1:]) keeps this cell
# working when it is re-run after non-numeric columns such as 'path' have been
# added to train_df.
predictor = train_df.select_dtypes(include='number').columns
corr_matrix = train_df[predictor].corr()
fig = plt.figure()
sns.set_theme(style="white")
# Hide the upper triangle (diagonal included): the matrix is symmetric, so it
# only repeats the lower triangle.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr_matrix, annot=True, fmt='.1g', cmap=cmap,
            mask=mask, square=True)
plt.title('Matriz de Correlação', fontsize=20, fontweight='bold')
plt.show()

Explorando as fotos

# Attach the image file path to every row (<train_path>/<Id>.jpg) and display
# the first image as a quick check.
train_path = Path('/content/comp_data/train')
train_df['path'] = [train_path / f'{img_id}.jpg' for img_id in train_df['Id']]
first_image = plt.imread(train_df.loc[0, 'path'])
plt.imshow(first_image);

Fotos por Pawpularity

def plot_pawpularity(score, n, seed=None):
    """Show up to `n` randomly chosen training photos with a given Pawpularity.

    Reads the global `train_df` (columns 'Pawpularity' and 'path') and draws
    the selected images side by side on a new figure.

    Args:
        score: Pawpularity value to filter on.
        n: Number of photos requested. If fewer rows have this score, all of
            them are shown instead of raising.
        seed: Optional random state forwarded to `DataFrame.sample` so the
            selection can be reproduced. Defaults to None (non-deterministic).

    Raises:
        ValueError: If no training row has the requested score.
    """
    if n < 1:
        return
    candidates = train_df.loc[train_df['Pawpularity'] == score, 'path']
    if candidates.empty:
        raise ValueError(f'No training image has Pawpularity == {score}')
    # Rare scores may have fewer than n photos; sampling more rows than exist
    # (without replacement) would raise.
    n_show = min(n, len(candidates))
    chosen = candidates.sample(n_show, random_state=seed)
    # A fresh figure per call keeps consecutive calls from drawing over each
    # other's axes. squeeze=False keeps `axes` 2-D even when n_show == 1.
    fig, axes = plt.subplots(1, n_show, squeeze=False)
    for ax, img_path in zip(axes[0], chosen):
        ax.imshow(plt.imread(img_path))
        ax.set_title(score)
        ax.axis('off')
# Show 4 random photos each for a low (10), a common (30) and a high (90) score.
plot_pawpularity(10, 4)
plot_pawpularity(30, 4)
plot_pawpularity(90, 4)

Baseline Simples

def RMSE(pred, targ):
    """Return the root-mean-square error between `pred` and `targ`.

    The difference `pred - targ` must support `*` and `.mean()` (e.g. a NumPy
    array or pandas Series); either argument may be a scalar that broadcasts
    against the other.
    """
    residual = pred - targ
    mean_squared_error = (residual * residual).mean()
    return mean_squared_error ** 0.5
sns.set(rc={'figure.figsize':(8,5)})
train_data = pd.read_csv('/content/comp_data/train.csv')
n_samp = len(train_data)
np.random.seed(42)
# Shuffle the row positions WITHOUT replacement so the first 80% / last 20%
# form disjoint train / test sets. The previous np.random.randint(0, n, n)
# sampled with replacement: rows were duplicated, others never used, and the
# same row could land in both train and test (leakage).
# Outputs recorded from earlier runs will differ after re-executing this cell.
split = np.random.permutation(n_samp)
split
array([7270,  860, 5390, ..., 4926, 9582, 9304])
# First 80% of the shuffled indices form the training targets, the rest the
# held-out targets.
n_train = int(0.8*n_samp)
targets = train_data['Pawpularity'].values
train = targets[split[:n_train]]
test = targets[split[n_train:]]
# Random-guess baseline: repeatedly draw len(test) training targets (without
# replacement), use them as predictions and record the RMSE of each draw.
n_trials = 10000
rmse = np.empty(n_trials)
for trial in range(n_trials):
    # `preds` is left holding the last draw; it is plotted further down.
    preds = np.random.choice(train, len(test), replace=False)
    rmse[trial] = RMSE(preds, test)
# Distribution of the RMSE values obtained by the random-guess baseline.
plt.hist(rmse, bins = 30);
# Histograms of the training targets, the last random prediction draw and the
# held-out targets.
plt.hist(train)
plt.hist(preds)
plt.hist(test);

Baseline mais simples ainda

# Constant baseline: predict the mean of the training targets for every held-out sample.
preds = train.mean()
RMSE(test, preds)
20.918360445610613
# Constant baseline: predict the median of the training targets for every held-out sample.
preds = np.median(train)
RMSE(test, preds)
21.472413666461463
from scipy.stats import mode
# Constant baseline: predict the most frequent training target.
# mode(...)[0] is the modal value; .item() converts it to a plain Python scalar.
preds = mode(train)[0].item()
RMSE(test, preds)
22.342630590521075

Influência de gato/cachorro

from fastai.vision.all import *
# Fetch the dataset referenced by URLs.PETS and point `path` at its 'images' folder.
path = untar_data(URLs.PETS)/'images'
def labeler(x):
    """Label an image file as 'Gato' or 'Cachorro' from its file name.

    `x` is a path-like object exposing `.name`. A file name whose first
    character is uppercase is labelled 'Gato'; anything else 'Cachorro'.
    """
    first_char = x.name[0]
    if first_char.isupper():
        return 'Gato'
    return 'Cachorro'
# Data pipeline for the cat-vs-dog classifier: image inputs, categorical
# targets derived from the file name by `labeler`, a seeded random 80/20
# train/validation split, and every image resized to 256.
dblock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    get_y=labeler,
    splitter=RandomSplitter(valid_pct=0.2, seed=42),
    item_tfms=Resize(256),
)
# Build the DataLoaders from the images under `path` and train a resnet50
# classifier, tracking accuracy on the validation split.
dls = dblock.dataloaders(path)
learn = cnn_learner(dls, resnet50, metrics=accuracy)
# The training log below shows one initial epoch followed by the 5 requested epochs.
learn.fine_tune(5)
100.00% [811712512/811706944 00:25<00:00]
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:718: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at  /pytorch/c10/core/TensorImpl.h:1156.)
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
epoch train_loss valid_loss accuracy time
0 0.097311 0.009561 0.997294 00:27
epoch train_loss valid_loss accuracy time
0 0.042957 0.043040 0.990528 00:31
1 0.031512 0.006856 0.996617 00:31
2 0.027372 0.008505 0.996617 00:31
3 0.009047 0.003232 0.998647 00:31
4 0.004652 0.003530 0.998647 00:31

Inferência

# Build the inference DataLoader from the learner's own DataLoaders so the
# competition images go through exactly the same item and batch transforms as
# the training images. cnn_learner adds an ImageNet Normalize batch transform
# to the DataLoaders it is given; a DataBlock built from scratch here would
# lack it, and the model would be fed differently scaled inputs than it was
# trained on.
# test_dl keeps the items in order, so predictions align with train_df rows.
# The name `dls` is kept because the following cells refer to it.
dls = learn.dls.test_dl(train_df['path'].tolist())
# Visual sanity check of one batch of competition images.
dls.show_batch()
# get_preds returns a pair; only the first element (the predictions) is kept.
preds, _ = learn.get_preds(dl = dls)
# Label each image from column 0 of `preds`: above 0.5 -> 'Cachorro', otherwise 'Gato'.
# NOTE(review): this assumes column 0 is the 'Cachorro' class in the learner's
# vocab — confirm with learn.dls.vocab.
train_df['tipo'] = ['Cachorro' if p > 0.5 else 'Gato' for p in preds[:,0]]
train_df['tipo'].value_counts()
Gato        5969
Cachorro    3943
Name: tipo, dtype: int64
# Compare the Pawpularity distribution of predicted dogs vs. cats:
# box plot on the left, overlaid histograms with KDE on the right.
sns.set(rc={'figure.figsize':(14,7)})
fig, ax = plt.subplots(1,2)
sns.boxplot(data=train_df, x='tipo', y='Pawpularity', ax=ax[0])
sns.histplot(train_df, x="Pawpularity", hue="tipo", kde=True, ax=ax[1])
# plt.show() matches the other plotting cells; Figure.show() is meant for GUI
# backends and can emit a "non-GUI backend" warning in notebooks.
plt.show()
# Mean Pawpularity per predicted animal type.
train_df.groupby('tipo')['Pawpularity'].mean()
tipo
Cachorro    41.903119
Gato        35.486514
Name: Pawpularity, dtype: float64
# Baseline conditioned on animal type: predict, for each held-out row, the
# mean Pawpularity of its predicted type computed on the training rows.
n_train = int(0.8*n_samp)
train = train_df.loc[split[:n_train]]
test = train_df.loc[split[n_train:]]
# groupby sorts its keys, so the two means come out as ('Cachorro', 'Gato').
type_means = train.groupby('tipo')['Pawpularity'].mean().values
m_cachorro, m_gato = type_means
preds = np.where(test['tipo'] == 'Cachorro', m_cachorro, m_gato)
RMSE(test['Pawpularity'], preds)
20.68950148400463