EDA Kaggle - Jigsaw Rate Severity of Toxic Comments
EDA for the Kaggle competition
- Setup
- Downloading the competition dataset
- Exploring the .csv files
- Creating IDs for the comments
- How many pairs did each rater process?
- How often does each comment appear?
- Is the same comparison made more than once?
- How many pairs were unanimous across the 3 raters?
- Are some raters particularly bad or good?
- Word cloud
- Are all the comments in English?
- Sentiment analysis using a pre-trained model
!pip install -Uqqq kaggle
!pip install -Uqqq plotnine
from google.colab import files
uploaded = files.upload()
uploaded.keys()
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
from plotnine import *
api.competition_download_files('jigsaw-toxic-severity-rating')
!unzip jigsaw-toxic-severity-rating.zip > /dev/null 2>&1
!rm jigsaw-toxic-severity-rating.zip
import pandas as pd
import numpy as np
df_test = pd.read_csv('/content/comments_to_score.csv')
df_test.tail()
df_comp = pd.read_csv('/content/validation_data.csv')
df_comp.tail()
unique_comments = pd.Series(df_comp.melt('worker')['value'].unique())
len(unique_comments), unique_comments[0]
unique_comments = unique_comments.str.strip().unique()
len(unique_comments), unique_comments[0]
id2txt = dict(enumerate(unique_comments))                    # id -> comment text
txt2id = {txt: i for i, txt in enumerate(unique_comments)}   # comment text -> id
id2txt[0]
txt2id['This article sucks \n\nwoo woo wooooooo']
df_comp['less_toxic_id'] = df_comp['less_toxic'].str.strip().apply(lambda x: txt2id.get(x, -999))
df_comp['more_toxic_id'] = df_comp['more_toxic'].str.strip().apply(lambda x: txt2id.get(x, -999))
df_comp
(df_comp['less_toxic_id'] < 0).mean() + (df_comp['more_toxic_id'] < 0).mean()
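If every comment mapped cleanly, the sum above is 0.0. A quick sanity check making that explicit (just a hedged assertion, assuming the strip-and-map step above):
# No row should carry the -999 sentinel for an unmapped comment.
assert (df_comp['less_toxic_id'] < 0).sum() == 0
assert (df_comp['more_toxic_id'] < 0).sum() == 0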
df_comp['worker'].nunique()
df = df_comp['worker'].value_counts().to_frame('n')
df.T
(ggplot(df, aes('n'))
+ geom_histogram(bins = 20, fill = 'orange', color = 'black')
+ ggtitle('Number of labels per "worker"')
)
df = pd.concat([df_comp['less_toxic_id'], df_comp['more_toxic_id']]).value_counts().to_frame('n_comparisons')
df.T
id2txt[1801]
df.value_counts().to_frame('count').sort_index().reset_index().style.bar(color = 'orange')
df_comp['hashed_comp'] = df_comp[['less_toxic_id', 'more_toxic_id']].astype(str).apply(lambda x: '-'.join(sorted(x)), axis=1)
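Sorting the stringified IDs makes the key order-independent, so the same unordered pair always produces the same string no matter which column holds which comment. A minimal illustration (the pair_key helper is hypothetical, for demonstration only):
# Order-independent key: swapping the two IDs yields the same string.
# The sort is lexicographic on strings, which is fine for identity
# even though '10-3' is not numerically ordered.
def pair_key(a, b):
    return '-'.join(sorted([str(a), str(b)]))
assert pair_key(3, 10) == pair_key(10, 3)  # both '10-3'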
df = df_comp['hashed_comp'].value_counts().to_frame('n')
df.T
(ggplot(df, aes('n'))
+ geom_bar(fill = 'orange', color = 'black')
+ ggtitle('Number of comparisons per "pair"')
)
df_comp['hashed_comp'] = df_comp[['less_toxic_id', 'more_toxic_id']].astype(str).apply(lambda x: '-'.join(x), axis=1)
df = df_comp['hashed_comp'].value_counts().to_frame('n')
df.T
df = df.loc[df['n'] > 1]
df['unanimous'] = df['n'] == 3
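The n == 3 rule assumes every unordered pair was shown to exactly three raters, so a directional key appearing three times means all of them chose the same direction. A small sketch to verify that assumption on the data:
# Directional counts for each unordered pair should sum to 3.
unordered = df_comp[['less_toxic_id', 'more_toxic_id']] \
    .astype(str).apply(lambda x: '-'.join(sorted(x)), axis=1)
unordered.value_counts().value_counts()  # expect a single row: 3 -> number of pairs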
(ggplot(df, aes('unanimous'))
+ geom_bar(fill = 'orange', color = 'black')
)
df_comp = df_comp.set_index('hashed_comp').join(df).dropna()
df_comp.tail()
df = df_comp.groupby('worker')['unanimous'].value_counts().to_frame('count').reset_index()
df = df.pivot(index='worker', columns='unanimous', values='count').reset_index()
df['total_reviews'] = df[True] + df[False]
df['unan_frac'] = df[True] / df['total_reviews']
df = df.dropna()
df
df['worker'] = df['worker'].astype('category')
df.dropna().sort_values('unan_frac', ascending = False)
_df = df.dropna().sort_values('unan_frac', ascending = False).head(20)
(ggplot(_df,
aes(x = 'worker', y = 'unan_frac'))
+ geom_bar(stat = 'identity', fill = 'orange', color = 'black')
+ ggtitle("Fraction of each rater's labels that are unanimous")
+ coord_flip()
+ scale_x_discrete(limits = _df['worker'][::-1])
)
_df = df.dropna().sort_values('unan_frac', ascending = False).tail(20)
(ggplot(_df,
aes(x = 'worker', y = 'unan_frac'))
+ geom_bar(stat = 'identity', fill = 'orange', color = 'black')
+ ggtitle("Fraction of each rater's labels that are unanimous")
+ coord_flip()
+ scale_x_discrete(limits = _df['worker'][::-1])
)
(ggplot(df, aes(x = 'total_reviews', y ='unan_frac'))
+ geom_hline(yintercept = 0.5, color = 'red')
+ geom_point(alpha = 0.5, color = 'orange')
+ geom_smooth(method='lm', color = 'red')
)
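The fitted line hints at whether heavier raters are more or less unanimous; a rank correlation quantifies it (a small sketch, not part of the original analysis):
# Spearman rank correlation between a rater's volume and their unanimity rate.
df[['total_reviews', 'unan_frac']].corr(method='spearman')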
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
non_toxic_comments = df_comp['less_toxic'].value_counts() \
.to_frame().head(1000)
non_toxic_text = ' '.join(non_toxic_comments.index.tolist())
toxic_comments = df_comp['more_toxic'].value_counts() \
.to_frame().head(1000)
toxic_text = ' '.join(toxic_comments.index.tolist())
wordcloud = WordCloud(max_font_size=50, max_words=100,width=500, height=500,
background_color="white") \
.generate(non_toxic_text)
wordcloud2 = WordCloud(max_font_size=50, max_words=100,width=500, height=500,
background_color="black") \
.generate(toxic_text)
fig, (ax1,ax2) = plt.subplots(1, 2, figsize=(15,15))
ax1.imshow(wordcloud, interpolation="bilinear")
ax1.axis("off")
ax2.imshow(wordcloud2, interpolation="bilinear")
ax2.axis("off")
ax1.set_title('LESS toxic comments', fontsize=25)
ax2.set_title('MORE toxic comments', fontsize=25)
plt.show()
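STOPWORDS is imported above but never used; if common English words dominate the clouds, they can be filtered out. A sketch reusing the same text (the stopwords argument is a standard WordCloud parameter):
# Optional: drop common English stopwords so content words stand out.
wordcloud_filtered = WordCloud(max_font_size=50, max_words=100, width=500, height=500,
                               background_color="white", stopwords=STOPWORDS) \
    .generate(non_toxic_text)
plt.imshow(wordcloud_filtered, interpolation="bilinear")
plt.axis("off")
plt.show()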
df = pd.concat([
pd.DataFrame({
'comment': non_toxic_comments.index.values,
'len': list(map(len, non_toxic_comments.index.values)),
'toxic': 'least'
}),
pd.DataFrame({
'comment': toxic_comments.index.values,
'len': list(map(len, toxic_comments.index.values)),
'toxic': 'most'
})
])
df['toxic'] = df['toxic'].astype('category')
df
df_mean = df[df['len'] < 500].groupby('toxic')['len'].median().reset_index()
df_mean
(ggplot(df, aes(x = 'len', fill = 'toxic'))
+ geom_density(alpha = 0.5)
+ geom_vline(df_mean, aes(xintercept = 'len', color = 'toxic'))
# + facet_grid('toxic~.')
+ xlim(0, 500)
)
!pip install -Uqqq pyicu
!pip install -Uqqq pycld2
!pip install -Uqqq morfessor
!pip install -Uqqq polyglot
from polyglot.detect import Detector
def get_language(text):
    return Detector("".join(x for x in text if x.isprintable()), quiet=True).languages[0].name
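A quick smoke test of the helper (expected labels hedged; cld2's exact names can vary):
get_language('This is a perfectly normal English sentence.')  # expected: 'English'
get_language('Esta frase está escrita em português.')         # expected: 'Portuguese'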
%%capture
langs = [get_language(comment) for comment in unique_comments]
df = pd.DataFrame({
'text': unique_comments,
'lang': langs
})
df.tail()
df['lang'].value_counts().to_frame().T
df[df['lang'] == 'un']
df[df['lang'] == 'German']
df[df['lang'] == 'Quechua']
!pip install -Uqqq transformers
from transformers import pipeline
from tqdm.notebook import tqdm
def predict(text):
    # Run the current `classifier` pipeline and return its scores as a one-row frame.
    try:
        p = classifier(text)[0]
        df = pd.DataFrame(p).set_index('label').T
        df['text'] = text
        return df
    except Exception:
        return None
classifier = pipeline("text-classification", model='distilbert-base-uncased-finetuned-sst-2-english', return_all_scores=True)
unique_comments[1]
predict(unique_comments[1])
predictions = [predict(text) for text in tqdm(unique_comments[:250])]
predictions = pd.concat(predictions).reset_index(drop = True)
predictions.tail(1)
(ggplot(predictions.melt('text'), aes(x = 'label', y = 'value'))
+ geom_boxplot(fill = 'orange', color = 'black')
)
(predictions['NEGATIVE'] > 0.5).mean()
predictions.sort_values('NEGATIVE').reset_index().loc[1, 'text']
classifier = pipeline("text-classification", model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)
unique_comments[0]
predict(unique_comments[0])
classifier = pipeline("text-classification", model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True, function_to_apply = 'sigmoid')
predict(unique_comments[0])
predictions = [predict(text) for text in tqdm(unique_comments[:250])]
predictions = pd.concat(predictions).reset_index(drop = True)
predictions.tail(1)
(ggplot(predictions.melt('text'), aes(x = 'label', y = 'value'))
+ geom_boxplot(fill = 'orange', color = 'black')
)
predictions.sort_values('love', ascending = False).reset_index().loc[1, 'text']
(ggplot(predictions, aes(x = 'anger', y = 'fear'))
+ geom_point(color = 'orange', alpha = 0.5)
)
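The scatter suggests the two emotion scores move together; a numeric companion (a small sketch):
# Pearson correlation between the 'anger' and 'fear' scores.
predictions[['anger', 'fear']].corr()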
classifier = pipeline("text-classification", model='unitary/toxic-bert', return_all_scores=True)
predictions = [predict(text) for text in tqdm(unique_comments[:250])]
predictions = pd.concat(predictions).reset_index(drop = True)
predictions.tail(1)
(ggplot(predictions.melt('text'), aes(x = 'label', y = 'value'))
+ geom_boxplot(fill = 'orange', color = 'black')
)
for txt in predictions.sort_values('toxic').tail(5)['text']:
    print('-' * 20)
    print(txt)
for txt in predictions.sort_values('toxic').head(5)['text']:
    print('-' * 20)
    print(txt)