EDA Kaggle - Jigsaw Rate Severity of Toxic Comments
EDA for the Kaggle competition
- Setup
- Downloading the competition dataset
- Exploring the .csv files
- Creating IDs for the comments
- How many pairs did each rater process?
- How often does each comment appear?
- Is the same comparison made more than once?
- How many pairs were unanimous across the 3 raters?
- Are some raters particularly bad or good?
- Word cloud
- Are all the comments in English?
- Sentiment analysis using a pre-trained model
!pip install -Uqqq kaggle
!pip install -Uqqq plotnine
from google.colab import files
uploaded = files.upload()
uploaded.keys()
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
from plotnine import *
api.competition_download_files('jigsaw-toxic-severity-rating')
!unzip jigsaw-toxic-severity-rating.zip > /dev/null 2>&1
!rm jigsaw-toxic-severity-rating.zip
import pandas as pd
import numpy as np
df_test = pd.read_csv('/content/comments_to_score.csv')
df_test.tail()
df_comp = pd.read_csv('/content/validation_data.csv')
df_comp.tail()
unique_comments = pd.Series(df_comp.melt('worker')['value'].unique())
len(unique_comments), unique_comments[0]
unique_comments = unique_comments.str.strip().unique()
len(unique_comments), unique_comments[0]
id2txt = dict(enumerate(unique_comments))                    # id -> comment text
txt2id = {txt: i for i, txt in enumerate(unique_comments)}   # comment text -> id
id2txt[0]
txt2id['This article sucks \n\nwoo woo wooooooo']
df_comp['less_toxic_id'] = df_comp['less_toxic'].str.strip().apply(lambda x: txt2id.get(x, -999))
df_comp['more_toxic_id'] = df_comp['more_toxic'].str.strip().apply(lambda x: txt2id.get(x, -999))
df_comp
(df_comp['less_toxic_id'] < 0).mean() + (df_comp['more_toxic_id'] < 0).mean()
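If every comment mapped cleanly, the sum above is 0.0. A quick sanity check making that explicit (just a hedged assertion, assuming the strip-and-map step above):
# No row should carry the -999 sentinel for an unmapped comment.
assert (df_comp['less_toxic_id'] < 0).sum() == 0
assert (df_comp['more_toxic_id'] < 0).sum() == 0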
df_comp['worker'].nunique()
df = df_comp['worker'].value_counts().to_frame('n')
df.T
(ggplot(df, aes('n'))
+ geom_histogram(bins = 20, fill = 'orange', color = 'black')
+ ggtitle('Number of labels per "worker"')
)
df = pd.concat([df_comp['less_toxic_id'], df_comp['more_toxic_id']]).value_counts().to_frame('n_comparisons')
df.T
id2txt[1801]
df.value_counts().to_frame('count').sort_index().reset_index().style.bar(color = 'orange')
df_comp['hashed_comp'] = df_comp[['less_toxic_id', 'more_toxic_id']].astype(str).apply(lambda x: '-'.join(sorted(x)), axis=1)
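Sorting the stringified IDs makes the key order-independent, so the same unordered pair always produces the same string no matter which column holds which comment. A minimal illustration (the pair_key helper is hypothetical, for demonstration only):
# Order-independent key: swapping the two IDs yields the same string.
# The sort is lexicographic on strings, which is fine for identity
# even though '10-3' is not numerically ordered.
def pair_key(a, b):
    return '-'.join(sorted([str(a), str(b)]))
assert pair_key(3, 10) == pair_key(10, 3)  # both '10-3'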
df = df_comp['hashed_comp'].value_counts().to_frame('n')
df.T
(ggplot(df, aes('n'))
+ geom_bar(fill = 'orange', color = 'black')
+ ggtitle('Number of comparisons per "pair"')
)
df_comp['hashed_comp'] = df_comp[['less_toxic_id', 'more_toxic_id']].astype(str).apply(lambda x: '-'.join(x), axis=1)
df = df_comp['hashed_comp'].value_counts().to_frame('n')
df.T
df = df.loc[df['n'] > 1]
df['unanimous'] = df['n'] == 3
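The n == 3 rule assumes every unordered pair was shown to exactly three raters, so a directional key appearing three times means all of them chose the same direction. A small sketch to verify that assumption on the data:
# Directional counts for each unordered pair should sum to 3.
unordered = df_comp[['less_toxic_id', 'more_toxic_id']] \
    .astype(str).apply(lambda x: '-'.join(sorted(x)), axis=1)
unordered.value_counts().value_counts()  # expect a single row: 3 -> number of pairs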
(ggplot(df, aes('unanimous'))
+ geom_bar(fill = 'orange', color = 'black')
)
df_comp = df_comp.set_index('hashed_comp').join(df).dropna()
df_comp.tail()
df = df_comp.groupby('worker')['unanimous'].value_counts().to_frame('count').reset_index()
df = df.pivot(index='worker', columns='unanimous', values='count').reset_index()
df['total_reviews'] = df[True] + df[False]
df['unan_frac'] = df[True] / df['total_reviews']
df = df.dropna()
df
df['worker'] = df['worker'].astype('category')
df.dropna().sort_values('unan_frac', ascending = False)
_df = df.dropna().sort_values('unan_frac', ascending = False).head(20)
(ggplot(_df,
aes(x = 'worker', y = 'unan_frac'))
+ geom_bar(stat = 'identity', fill = 'orange', color = 'black')
+ ggtitle("Fraction of each rater's labels that are unanimous")
+ coord_flip()
+ scale_x_discrete(limits = _df['worker'][::-1])
)
_df = df.dropna().sort_values('unan_frac', ascending = False).tail(20)
(ggplot(_df,
aes(x = 'worker', y = 'unan_frac'))
+ geom_bar(stat = 'identity', fill = 'orange', color = 'black')
+ ggtitle("Fraction of each rater's labels that are unanimous")
+ coord_flip()
+ scale_x_discrete(limits = _df['worker'][::-1])
)
(ggplot(df, aes(x = 'total_reviews', y ='unan_frac'))
+ geom_hline(yintercept = 0.5, color = 'red')
+ geom_point(alpha = 0.5, color = 'orange')
+ geom_smooth(method='lm', color = 'red')
)
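The fitted line hints at whether heavier raters are more or less unanimous; a rank correlation quantifies it (a small sketch, not part of the original analysis):
# Spearman rank correlation between a rater's volume and their unanimity rate.
df[['total_reviews', 'unan_frac']].corr(method='spearman')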
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
non_toxic_comments = df_comp['less_toxic'].value_counts() \
.to_frame().head(1000)
non_toxic_text = ' '.join(non_toxic_comments.index.tolist())
toxic_comments = df_comp['more_toxic'].value_counts() \
.to_frame().head(1000)
toxic_text = ' '.join(toxic_comments.index.tolist())
wordcloud = WordCloud(max_font_size=50, max_words=100,width=500, height=500,
background_color="white") \
.generate(non_toxic_text)
wordcloud2 = WordCloud(max_font_size=50, max_words=100,width=500, height=500,
background_color="black") \
.generate(toxic_text)
fig, (ax1,ax2) = plt.subplots(1, 2, figsize=(15,15))
ax1.imshow(wordcloud, interpolation="bilinear")
ax1.axis("off")
ax2.imshow(wordcloud2, interpolation="bilinear")
ax2.axis("off")
ax1.set_title('LESS toxic comments', fontsize=25)
ax2.set_title('MORE toxic comments', fontsize=25)
plt.show()
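STOPWORDS is imported above but never used; if common English words dominate the clouds, they can be filtered out. A sketch reusing the same text (the stopwords argument is a standard WordCloud parameter):
# Optional: drop common English stopwords so content words stand out.
wordcloud_filtered = WordCloud(max_font_size=50, max_words=100, width=500, height=500,
                               background_color="white", stopwords=STOPWORDS) \
    .generate(non_toxic_text)
plt.imshow(wordcloud_filtered, interpolation="bilinear")
plt.axis("off")
plt.show()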
df = pd.concat([
pd.DataFrame({
'comment': non_toxic_comments.index.values,
'len': list(map(len, non_toxic_comments.index.values)),
'toxic': 'least'
}),
pd.DataFrame({
'comment': toxic_comments.index.values,
'len': list(map(len, toxic_comments.index.values)),
'toxic': 'most'
})
])
df['toxic'] = df['toxic'].astype('category')
df
df_mean = df[df['len'] < 500].groupby('toxic')['len'].median().reset_index()
df_mean
(ggplot(df, aes(x = 'len', fill = 'toxic'))
+ geom_density(alpha = 0.5)
+ geom_vline(df_mean, aes(xintercept = 'len', color = 'toxic'))
# + facet_grid('toxic~.')
+ xlim(0, 500)
)
!pip install -Uqqq pyicu
!pip install -Uqqq pycld2
!pip install -Uqqq morfessor
!pip install -Uqqq polyglot
from polyglot.detect import Detector
def get_language(text):
    return Detector("".join(x for x in text if x.isprintable()), quiet=True).languages[0].name
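A quick smoke test of the helper (expected labels hedged; cld2's exact names can vary):
get_language('This is a perfectly normal English sentence.')  # expected: 'English'
get_language('Esta frase está escrita em português.')         # expected: 'Portuguese'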
%%capture
langs = [get_language(comment) for comment in unique_comments]
df = pd.DataFrame({
'text': unique_comments,
'lang': langs
})
df.tail()
df['lang'].value_counts().to_frame().T
df[df['lang'] == 'un']
df[df['lang'] == 'German']
df[df['lang'] == 'Quechua']
!pip install -Uqqq transformers
from transformers import pipeline
from tqdm.notebook import tqdm
def predict(text):
    # Run the current `classifier` pipeline and return its scores as a one-row frame.
    try:
        p = classifier(text)[0]
        df = pd.DataFrame(p).set_index('label').T
        df['text'] = text
        return df
    except Exception:
        return None
classifier = pipeline("text-classification", model='distilbert-base-uncased-finetuned-sst-2-english', return_all_scores=True)
unique_comments[1]
predict(unique_comments[1])
predictions = [predict(text) for text in tqdm(unique_comments[:250])]
predictions = pd.concat(predictions).reset_index(drop = True)
predictions.tail(1)
(ggplot(predictions.melt('text'), aes(x = 'label', y = 'value'))
+ geom_boxplot(fill = 'orange', color = 'black')
)
(predictions['NEGATIVE'] > 0.5).mean()
predictions.sort_values('NEGATIVE').reset_index().loc[1, 'text']
classifier = pipeline("text-classification", model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)
unique_comments[0]
predict(unique_comments[0])
classifier = pipeline("text-classification", model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True, function_to_apply = 'sigmoid')
predict(unique_comments[0])
predictions = [predict(text) for text in tqdm(unique_comments[:250])]
predictions = pd.concat(predictions).reset_index(drop = True)
predictions.tail(1)
(ggplot(predictions.melt('text'), aes(x = 'label', y = 'value'))
+ geom_boxplot(fill = 'orange', color = 'black')
)
predictions.sort_values('love', ascending = False).reset_index().loc[1, 'text']
(ggplot(predictions, aes(x = 'anger', y = 'fear'))
+ geom_point(color = 'orange', alpha = 0.5)
)
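The scatter suggests the two emotion scores move together; a numeric companion (a small sketch):
# Pearson correlation between the 'anger' and 'fear' scores.
predictions[['anger', 'fear']].corr()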
classifier = pipeline("text-classification", model='unitary/toxic-bert', return_all_scores=True)
predictions = [predict(text) for text in tqdm(unique_comments[:250])]
predictions = pd.concat(predictions).reset_index(drop = True)
predictions.tail(1)
(ggplot(predictions.melt('text'), aes(x = 'label', y = 'value'))
+ geom_boxplot(fill = 'orange', color = 'black')
)
for txt in predictions.sort_values('toxic').tail(5)['text']:
    print('-' * 20)
    print(txt)
for txt in predictions.sort_values('toxic').head(5)['text']:
    print('-' * 20)
    print(txt)