Skip to content

Commit

Permalink
Adding new manual sample for nlwiki
Browse files Browse the repository at this point in the history
  • Loading branch information
halfak committed Dec 13, 2021
2 parents ed260ae + ba72c4e commit 32ac926
Show file tree
Hide file tree
Showing 10 changed files with 452 additions and 275 deletions.
37 changes: 29 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ tuning_reports: \
trwiki_tuning_reports \
wikidatawiki_tuning_reports

wp10_major_minor = 0.8
wp10_major_minor = 0.9
page_level_major_minor = 0.3
item_quality_major_minor = 0.5

Expand Down Expand Up @@ -524,8 +524,29 @@ frwikisource_tuning_reports: \
datasets/nlwiki.balanced_labelings.1650_2021.json:
wget https://raw.githubusercontent.com/wikimedia/nlwiki_articlequality/master/datasets/nlwiki-20201101.balanced_sample.json -qO- > $@

datasets/nlwiki.labeled_revisions.w_cache.1650_2021.json: \
datasets/nlwiki.balanced_labelings.1650_2021.json
# Keep only the observations labeled A, B or E from the balanced 1650 sample.
# The prerequisite is required: without it `$<` expands to nothing and `cat`
# would block reading stdin. The grep pattern must be fully quoted so the
# shell does not see an unterminated string.
datasets/nlwiki.balanced_As_Bs_and_Es.from_balanced_labelings.1650_2021.json: \
		datasets/nlwiki.balanced_labelings.1650_2021.json
	cat $< | grep -P '"wp10": "(A|B|E)"' > $@

# Concatenate the filtered balanced sample with the two hand-labeled samples
# into a single JSON-lines labeling set. The prerequisite names must match the
# checked-in dataset files (nlwiki.human_labels.manually_extracted.<date>.json),
# otherwise make fails with "No rule to make target".
datasets/nlwiki.combined_labelings.1700_2021.json: \
		datasets/nlwiki.balanced_As_Bs_and_Es.from_balanced_labelings.1650_2021.json \
		datasets/nlwiki.human_labels.manually_extracted.2021-09-23.json \
		datasets/nlwiki.human_labels.manually_extracted.2021-12-12.json
	cat $^ > $@

# Runs `./utility extract_scores` over the nlwiki 20210901 pages-articles dump
# parts with models/nlwiki.wp10.gradient_boosting.model and redirects the
# output to the target file.
# NOTE(review): the model file is read here but is not listed as a
# prerequisite -- confirm that is intentional.
# NOTE(review): --class-weight presumably assigns a numeric weight per class
# (A=5 ... E=1) and --sunset a cutoff timestamp; verify against the utility.
datasets/nlwiki.latest_scores.20210901.tsv:
./utility extract_scores /mnt/data/xmldatadumps/public/nlwiki/20210901/nlwiki-20210901-pages-articles?.xml-p*.bz2 \
--class-weight='"A"=5' --class-weight='"B"=4' --class-weight='"C"=3' --class-weight='"D"=2' --class-weight='"E"=1' \
--sunset=20210901000000 --model=models/nlwiki.wp10.gradient_boosting.model > $@

# Sample 100 rows scored "C" from the latest-scores TSV and convert them to
# JSON lines. The TSV header line is emitted first, ahead of the sampled rows.
# Rows whose text matches "\tLijst van" or "in het seizoen" are excluded
# before sampling.
#
# `head` and `cat` must be separate commands (note the `;`); otherwise `cat`
# and the second path are passed to `head` as file names. Every line of the
# pipeline ends in a backslash so make hands the whole subshell to one shell
# invocation instead of running each line separately.
datasets/nlwiki.latest_scores.20210901.100_Cs.json: \
		datasets/nlwiki.latest_scores.20210901.tsv
	(head -n1 $<; \
	 cat $< | \
	 grep -P "\tC\t" | \
	 grep -P -v "(\tLijst van)|(in het seizoen)" | \
	 shuf -n 100) | tsv2json int str int str str float > $@

datasets/nlwiki.combined_labelings.1700_2021.w_cache.json: \
datasets/nlwiki.combined_labelings.1700_2021.json
cat $< | \
revscoring extract \
articlequality.feature_lists.nlwiki.wp10 \
Expand All @@ -534,7 +555,7 @@ datasets/nlwiki.labeled_revisions.w_cache.1650_2021.json: \


tuning_reports/nlwiki.wp10.md: \
datasets/nlwiki.labeled_revisions.w_cache.1650_2021.json
datasets/nlwiki.combined_labelings.1700_2021.w_cache.json
cat $< | \
revscoring tune \
config/classifiers.params.yaml \
Expand All @@ -551,17 +572,17 @@ tuning_reports/nlwiki.wp10.md: \
--debug > $@

models/nlwiki.wp10.gradient_boosting.model: \
datasets/nlwiki.labeled_revisions.w_cache.1650_2021.json
datasets/nlwiki.combined_labelings.1700_2021.w_cache.json
cat $< | \
revscoring cv_train \
revscoring.scoring.models.GradientBoosting \
articlequality.feature_lists.nlwiki.wp10 \
wp10 \
--version $(wp10_major_minor).0 \
-p 'max_depth=3' \
-p 'learning_rate=0.01' \
-p 'max_depth=5' \
-p 'learning_rate=0.1' \
-p 'max_features="log2"' \
-p 'n_estimators=300' \
-p 'n_estimators=700' \
--pop-rate '"E"=0.20' \
--pop-rate '"D"=0.20' \
--pop-rate '"C"=0.20' \
Expand Down
4 changes: 2 additions & 2 deletions articlequality/feature_lists/nlwiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@
max(wikitext.revision.content_chars, 1),
cn_templates,
cn_templates / max(wikitext.revision.content_chars, 1),
infoboxes,
infoboxes / max(wikitext.revision.content_chars, 1)
infobox_templates,
infobox_templates / max(wikitext.revision.content_chars, 1)
]

wp10 = local_wiki + wikipedia.article
2 changes: 1 addition & 1 deletion articlequality/feature_lists/wikipedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
modifiers.max(wikitext.revision.content_chars, 1)),
wikitext.revision.headings_by_level(3),
(wikitext.revision.headings_by_level(3) /
modifiers.max(wikitext.revision.content_chars, 1))
modifiers.max(wikitext.revision.content_chars, 1)),
wikitext.revision.list_items,
(wikitext.revision.list_items /
modifiers.max(wikitext.revision.content_chars, 1)),
Expand Down
64 changes: 64 additions & 0 deletions datasets/nlwiki.human_labels.manually_extracted.2021-09-23.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{"rev_id": 50697358, "page_title": "Simandraolo", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 50664054, "page_title": "Bumi Asih", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 58954675, "page_title": "Ptinus perplexus", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 57725157, "page_title": "Žasliai", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 54520210, "page_title": "Hindhead", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 57576297, "page_title": "Zgornji Janževski Vrh", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 35953904, "page_title": "Glen Davis", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 49980705, "page_title": "Ringuelet", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 40669768, "page_title": "Szellő", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 38819585, "page_title": "Libavské Údolí", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 40662863, "page_title": "Kiedrowice", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 37113664, "page_title": "Coccobius debachi", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 57570693, "page_title": "Hermanci", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 53755619, "page_title": "Cheam", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 42704313, "page_title": "Staré Bříště", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 55550580, "page_title": "Langangen", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 58777339, "page_title": "Hof bei Straden", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 50681819, "page_title": "Mandalasari (Cipatat)", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 40176205, "page_title": "Klášterská Lhota", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 53043278, "page_title": "Jeseník nad Odrou", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 54533523, "page_title": "Methode van Hardy Cross", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 58065364, "page_title": "Odilo Scherer", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 51950613, "page_title": "ATP-toernooi van Sydney 2013", "wp10": "C", "notes": "Kladblok"}
{"rev_id": 51057044, "page_title": "Katholieke Kerk in Malta", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 49478606, "page_title": "Castiglione in Teverina", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 55947453, "page_title": "NGC 1957", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 44820809, "page_title": "Lijst van biologische bestrijders en bestuivers", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 31333752, "page_title": "Socotá", "wp10": "E", "notes": "Kladblok - Disambiguation page"}
{"rev_id": 58687539, "page_title": "Wijngaarden (Friesland)", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 58441243, "page_title": "Breux-Jouy", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 55878515, "page_title": "Bisschoppelijk Paleis van Roermond", "wp10": "E", "notes": "Kladblok - Disambiguation page"}
{"rev_id": 58441156, "page_title": "Keane Barry", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 48799725, "page_title": "Pouillac", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 58155398, "page_title": "Eva Gabrielsson", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 36071420, "page_title": "Tucker", "wp10": "E", "notes": "Kladblok - Disambiguation page"}
{"rev_id": 58980134, "page_title": "ARD", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 51012264, "page_title": "Elachistocleis", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 35737742, "page_title": "Stand (klimsport)", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 51911364, "page_title": "Radicaal 97", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 36557124, "page_title": "Moimenta (Vinhais)", "wp10": "E", "notes": "Kladblok"}
{"rev_id": 57048635, "page_title": "De Cock en moord op bestelling", "wp10": "D", "notes": "Kladblok"}
{"rev_id": 58550657, "page_title": "Hou Yifan", "wp10": "B", "notes": "Kladblok"}
{"rev_id": 58934483, "page_title": "Tom Morello", "wp10": "B", "notes": "Kladblok"}
{"rev_id": 58338464, "page_title": "Ovidius", "wp10": "B", "notes": "Kladblok"}
{"rev_id": 58084625, "page_title": "Aart Arnout van Schelven", "wp10": "C", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 57758740, "page_title": "Nationale Koninklijke Beweging", "wp10": "C", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 18567817, "page_title": "Kempisty", "wp10": "E", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 49175951, "page_title": "Dresus", "wp10": "E", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 55630110, "page_title": "Gemarkung", "wp10": "E", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 59554400, "page_title": "Grijze Wolven", "wp10": "C", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 59632848, "page_title": "Tagamõisa (schiereiland)", "wp10": "C", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 37021310, "page_title": "Machimia coccoscela", "wp10": "E", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 54531368, "page_title": "Altmuehlopterus", "wp10": "C", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 59651986, "page_title": "Värska (plaats)", "wp10": "C", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 55825776, "page_title": "Natasha Hansen", "wp10": "D", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 58910325, "page_title": "Wintersport in Californië", "wp10": "C", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 59694093, "page_title": "Lijst van snelwegparkings in België", "wp10": "D", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 59721874, "page_title": "Beleg van Leiden (1573-1574)", "wp10": "A", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 59733250, "page_title": "Gifsumak", "wp10": "D", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 59735611, "page_title": "Gifsumak", "wp10": "C", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 50727188, "page_title": "Adventief (morfologie)", "wp10": "D", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 59756647, "page_title": "Lijst van wapens van Estische gemeenten", "wp10": "D", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 59059432, "page_title": "Boyshorts", "wp10": "C", "notes": "Vreemde eenden in de bijt"}
{"rev_id": 59900095, "page_title": "Kwantitatieve versoepeling", "wp10": "B", "notes": "Vreemde eenden in de bijt"}
56 changes: 56 additions & 0 deletions datasets/nlwiki.human_labels.manually_extracted.2021-12-12.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{"page_name": "Battle Mountain", "rev_id": 52460837, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Don't Change Your Husband", "rev_id": 45963723, "wp10": "E", "notes": "Kladblok"}
{"page_name": "Rally van Sardinië 2011", "rev_id": 59732009, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Meritsjleri", "rev_id": 58471630, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Coffee & Co", "rev_id": 58779170, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Holebikort", "rev_id": 59396686, "wp10": "C", "notes": "Kladblok"}
{"page_name": "Station Herne (België)", "rev_id": 58565985, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Ingeborg Sæhlie", "rev_id": 42137552, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Errindlev (parochie)", "rev_id": 47196595, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Buys Ballotmedaille", "rev_id": 53504055, "wp10": "C", "notes": "Kladblok"}
{"page_name": "Donor-acceptorbinding", "rev_id": 49955610, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Sint-Radegundiskerk", "rev_id": 57093404, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Alatina tetraptera", "rev_id": 54393173, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Fréville (Seine-Maritime)", "rev_id": 57517234, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Destination Berlin (Tangerine Dream)", "rev_id": 46462957, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Symfonie nr. 46 (Hovhaness)", "rev_id": 49373324, "wp10": "C", "notes": "Kladblok"}
{"page_name": "Mickey Wright", "rev_id": 55728220, "wp10": "C", "notes": "Kladblok"}
{"page_name": "Anduze", "rev_id": 59702047, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Rosemary's Sons", "rev_id": 58687040, "wp10": "D", "notes": "Kladblok"}
{"page_name": "AOX", "rev_id": 51766282, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Durgerdam slaapt", "rev_id": 57759682, "wp10": "C", "notes": "Kladblok"}
{"page_name": "Maria Oespenskaja", "rev_id": 44201093, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Bundesstraße 480", "rev_id": 55091442, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Criollo (paard)", "rev_id": 58380140, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Collectie Landschapskunst Flevoland", "rev_id": 59454995, "wp10": "C", "notes": "Kladblok"}
{"page_name": "Scott Robinson (zanger)", "rev_id": 52430166, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Jordi Balk", "rev_id": 55862634, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Bianca Knight", "rev_id": 47160117, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Asha Gigi", "rev_id": 49564281, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Hugo Bosch", "rev_id": 55100269, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Thomas Borgmann", "rev_id": 58132041, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Geister Rikscha", "rev_id": 55458997, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Factorio", "rev_id": 58196935, "wp10": "C", "notes": "Kladblok"}
{"page_name": "SV Sodingen 1912", "rev_id": 59717447, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Shaun Wright-Phillips", "rev_id": 59770335, "wp10": "C", "notes": "Kladblok"}
{"page_name": "N619 (België)", "rev_id": 47071349, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Elies Lemkes-Straver", "rev_id": 59442606, "wp10": "C", "notes": "Kladblok"}
{"page_name": "MTV Unplugged in New York", "rev_id": 57445549, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Lena Nuding", "rev_id": 58824695, "wp10": "E", "notes": "Kladblok"}
{"page_name": "Baldwin (Pennsylvania)", "rev_id": 49794839, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Deutscher Soldatenfriedhof Fricourt", "rev_id": 56554717, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Portugal op het Eurovisiesongfestival 1977", "rev_id": 59426404, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Josef Anton Schobinger", "rev_id": 58046129, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Gustaaf Hermans", "rev_id": 58654831, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Bestuurlijke indeling van Nagorno-Karabach", "rev_id": 57392637, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Ulster Grand Prix 1960", "rev_id": 52652958, "wp10": "C", "notes": "Kladblok"}
{"page_name": "Maatschap", "rev_id": 59015465, "wp10": "C", "notes": "Kladblok"}
{"page_name": "Blink", "rev_id": 59552706, "wp10": "E", "notes": "Kladblok"}
{"page_name": "Wereldkampioenschappen schermen 2010", "rev_id": 59760454, "wp10": "E", "notes": "Kladblok"}
{"page_name": "Stichtsche Cricket en Hockey Club", "rev_id": 58651601, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Wormer (dorp)", "rev_id": 58860090, "wp10": "B", "notes": "Kladblok"}
{"page_name": "Die Hard 2", "rev_id": 56608497, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Exploitatie Maatschappij Scheveningen", "rev_id": 58879142, "wp10": "D", "notes": "Kladblok"}
{"page_name": "Charlesville (schip, 1951)", "rev_id": 57400871, "wp10": "C", "notes": "Kladblok"}
{"page_name": "Etnologische tentoonstelling", "rev_id": 58338441, "wp10": "C", "notes": "Kladblok"}
{"page_name": "A36", "rev_id": 35891216, "wp10": "E", "notes": "Kladblok"}
Loading

0 comments on commit 32ac926

Please sign in to comment.