EDA da competição Sartorius Cell Instance Segmentation - Kaggle
Tutorial de EDA para a competição Kaggle
- Setup
- Baixando o dataset da competição
- Explorando o CSV
- Explorando as fotos
- Entendendo as máscaras 😷
- Transformando em uma função
- Comparando com a foto
# Notebook setup: install (or upgrade, -U) the Kaggle API client and fastai,
# with pip output silenced (-qqq).
!pip install -Uqqq kaggle
!pip install -Uqqq fastai
# Colab-only step: upload files into the runtime through the browser.
# The shell command below expects one of them to be named kaggle.json.
from google.colab import files
uploaded = files.upload()
uploaded.keys()
# Move the uploaded token into ~/.kaggle/ and make it readable/writable by
# the owner only (chmod 600).
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
# Build the Kaggle API client and authenticate it
# (presumably with the kaggle.json placed above -- confirm against the
# Kaggle API documentation).
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# Download the competition files through the authenticated `api` client.
# The unzip command below expects the archive in the current working
# directory under the name sartorius-cell-instance-segmentation.zip.
api.competition_download_files('sartorius-cell-instance-segmentation')
# Extract everything into comp_data/, discarding unzip's stdout and stderr.
!unzip sartorius-cell-instance-segmentation.zip -d comp_data > /dev/null 2>&1
# Load the training annotations table and peek at its last rows.
df = pd.read_csv('/content/comp_data/train.csv')
df.tail()
# NOTE(review): img_shape is not used anywhere in the visible code; it looks
# like (width, height), whereas rle_decode defaults to (520, 704) -- confirm.
img_shape = (704, 520)
# Keep only the three columns used in the rest of the analysis.
df = df[['id', 'annotation', 'cell_type']]
df.head()
# Row count per cell type, number of distinct ids, and row count per id.
df['cell_type'].value_counts()
df['id'].nunique()
df['id'].value_counts()
# One row per (cell_type, id) pair; n_seg is the number of CSV rows
# (presumably one annotation each) that the id has.
plt_df = df.groupby('cell_type')['id'].value_counts().to_frame(name = 'n_seg').reset_index()
plt_df
# Distribution of n_seg: overall, split by cell type, and as box plots.
sns.histplot(x = 'n_seg', data = plt_df);
sns.histplot(x = 'n_seg', hue = 'cell_type', data = plt_df);
sns.boxplot(x = 'cell_type', y='n_seg', data = plt_df);
# For every id, the share of its rows belonging to each cell type, then a
# summary of those shares. If each id has exactly one cell type, every share
# is 1.0.
df.groupby('id')['cell_type'].value_counts(normalize=True).describe()
# Star import: brings fastai's vision helpers into the global namespace
# (get_image_files, PILImageBW and L are the ones used below).
from fastai.vision.all import *
train_path = Path('/content/comp_data/train')
# NOTE: this rebinds `files`, shadowing the google.colab `files` module that
# was imported during setup.
files = get_image_files(train_path)
files
# Open the first listed image with fastai's PILImageBW and display it.
img = PILImageBW.create(files[0])
img
# `shape` is reused later as shape[0] * shape[1] and as the reshape target
# for the decoded mask -- presumably (height, width); confirm.
shape = img.shape
# Take the annotation of the first row: a run-length-encoded (RLE) mask
# stored as one whitespace-separated string.
mask = df.loc[0, 'annotation']
mask
s = mask.split()
L(s)
# Tokens alternate: even positions are run starts, odd positions are run
# lengths.
s[0], s[2], s[4], s[6]
s[1], s[3], s[5], s[7]
starts = np.array(s[0::2], dtype = 'int')
lengths = np.array(s[1::2], dtype = 'int')
starts, lengths
# Expand every run into the flat pixel indices it covers. Run starts follow
# the 1-based Kaggle RLE convention, so a run covers the 0-based indices
# [start - 1, start - 1 + length): exactly `length` pixels. (Using
# arange(start, start + length + 1) would shift the mask by one pixel, add an
# extra pixel to every run and can index past the end of the canvas.)
mask_decompressed = np.concatenate([np.arange(start - 1, start - 1 + length) for start, length in zip(starts, lengths)])
mask_decompressed[:20]
# Paint the covered pixels on a flat canvas, then fold it back into the
# image shape taken from the photo loaded earlier.
msk_img = np.zeros((shape[0] * shape[1]))
msk_img[mask_decompressed] = 1
msk_img = msk_img.reshape(shape)
plt.imshow(msk_img);
def rle_decode(mask_rle, shape = (520, 704)):
    """Decode a run-length-encoded mask string into a boolean image.

    Args:
        mask_rle: Whitespace-separated string of alternating
            ``start length`` integers. Starts are 1-based indices into the
            flattened image (Kaggle RLE convention). An empty string yields
            an all-False mask.
        shape: ``(rows, cols)`` of the output array.

    Returns:
        Boolean ``numpy.ndarray`` of shape ``shape`` that is True on the
        encoded pixels. The flat mask is folded back with NumPy's default
        (row-major) ``reshape``.
    """
    tokens = mask_rle.split()
    # Convert 1-based starts to 0-based offsets into the flat canvas.
    starts = np.array(tokens[0::2], dtype=int) - 1
    lengths = np.array(tokens[1::2], dtype=int)

    # Builtin `bool` instead of the `np.bool` alias, which does not exist in
    # NumPy 1.24-1.26.
    flat_mask = np.zeros(shape[0] * shape[1], dtype=bool)
    # Each run covers exactly `length` pixels: [start, start + length).
    # Slice assignment also copes with an annotation that has no runs.
    for start, length in zip(starts, lengths):
        flat_mask[start:start + length] = True
    return flat_mask.reshape(shape)
# Decode the single annotation selected earlier with the helper and show it.
plt.imshow(rle_decode(mask));
# Index the table by image id so every annotation of one image can be
# selected with .loc.
df = df.set_index('id')
df
df.loc['ffdb3cc02eef'].tail()
# All RLE strings that belong to this image, as a NumPy array.
masks = df.loc['ffdb3cc02eef', 'annotation'].values
masks[:3]
plt.imshow(rle_decode(masks[0]));
plt.imshow(rle_decode(masks[-1]));
# Decode every annotation and stack the results along a new leading axis.
mask_imgs = [rle_decode(m) for m in masks]
len(mask_imgs)
mask_img = np.stack(mask_imgs)
mask_img.shape
# Summing over the stack axis counts how many annotations cover each pixel;
# a maximum above 1 means that some masks overlap.
mask_img = mask_img.sum(axis = 0)
mask_img.shape
plt.imshow(mask_img);
mask_img.min(), mask_img.max()
# Collapse the counts into a binary mask. Builtin `bool` is used because the
# `np.bool` alias was deprecated in NumPy 1.20 and does not exist in
# NumPy 1.24-1.26, where it raises AttributeError.
mask_img = mask_img.astype(bool)
plt.imshow(mask_img);
# Load the photo that the combined mask belongs to and draw both side by
# side: photo on the left, mask on the right, without axis decorations.
file = train_path / ('ffdb3cc02eef' + '.png')
img = PILImageBW.create(file)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(13, 6))
photo_ax, mask_ax = ax[0], ax[1]
photo_ax.imshow(np.array(img), cmap='Greys')
mask_ax.imshow(mask_img)
for panel in (photo_ax, mask_ax):
    panel.axis("off")
plt.tight_layout();