From f28ed311c897d5e52eb5ff0238fd44f1b6550ad1 Mon Sep 17 00:00:00 2001
From: Aaron Halfaker <ahalfaker@wikimedia.org>
Date: Tue, 14 Jul 2020 19:54:41 +0000
Subject: [PATCH] Trains enwiki reverted_for_damage model

---
 Makefile                                      | 31 +++++++
 model_info/enwiki.reverted.md                 | 81 +++++++++++++++++++
 .../enwiki.reverted.gradient_boosting.model   |  3 +
 3 files changed, 115 insertions(+)
 create mode 100644 model_info/enwiki.reverted.md
 create mode 100644 models/enwiki.reverted.gradient_boosting.model

diff --git a/Makefile b/Makefile
index a209df1..9a490d7 100644
--- a/Makefile
+++ b/Makefile
@@ -913,6 +913,37 @@ models/enwiki.damaging.gradient_boosting.model: \
 
 	revscoring model_info $@ > model_info/enwiki.damaging.md
 
+# Pipe the labeled enwiki 20k_2015 revisions through `./utility autolabel`; its stdout is the target.
+datasets/enwiki.autolabeled_revisions.w_cache.20k_2015.json: \
+		datasets/enwiki.labeled_revisions.w_cache.20k_2015.json
+	cat $< | \
+	./utility autolabel --host=https://en.wikipedia.org \
+		--trusted-groups=bot,bureaucrat,sysop \
+		--trusted-edits=1000 \
+		--revert-radius=5 \
+		--verbose > $@
+
+# Train the enwiki `reverted_for_damage` GradientBoosting model from the autolabeled 20k_2015 dataset (reuses the damaging feature list, version prefix and label weight).
+models/enwiki.reverted.gradient_boosting.model: \
+		datasets/enwiki.autolabeled_revisions.w_cache.20k_2015.json
+	cat $< | \
+	revscoring cv_train \
+		revscoring.scoring.models.GradientBoosting \
+		editquality.feature_lists.enwiki.damaging \
+		reverted_for_damage \
+		--version=$(damaging_major_minor).1 \
+		-p 'learning_rate=0.01' \
+		-p 'max_depth=7' \
+		-p 'max_features="log2"' \
+		-p 'n_estimators=700' \
+		--label-weight $(damaging_weight) \
+		--pop-rate "true=0.034163555464634586" \
+		--pop-rate "false=0.9658364445353654" \
+		--center --scale > $@
+# Side output (not a target of its own): write the new model's `revscoring model_info` report to model_info/.
+	revscoring model_info $@ > model_info/enwiki.reverted.md
+
+
 tuning_reports/enwiki.goodfaith.md: \
 		datasets/enwiki.labeled_revisions.w_cache.20k_2015.json
 	cat $< | \
diff --git a/model_info/enwiki.reverted.md b/model_info/enwiki.reverted.md
new file mode 100644
index 0000000..8c9119a
--- /dev/null
+++ b/model_info/enwiki.reverted.md
@@ -0,0 +1,81 @@
+Model Information:
+	 - type: GradientBoosting
+	 - version: 0.5.1
+	 - params: {'n_iter_no_change': None, 'validation_fraction': 0.1, 'ccp_alpha': 0.0, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 700, 'min_impurity_split': None, 'init': None, 'subsample': 1.0, 'multilabel': False, 'label_weights': OrderedDict([(True, 10)]), 'max_depth': 7, 'scale': True, 'max_features': 'log2', 'tol': 0.0001, 'presort': 'deprecated', 'labels': [True, False], 'min_samples_split': 2, 'population_rates': None, 'warm_start': False, 'center': True, 'criterion': 'friedman_mse', 'min_samples_leaf': 1, 'max_leaf_nodes': None, 'loss': 'deviance', 'learning_rate': 0.01, 'random_state': None, 'verbose': 0, 'min_impurity_decrease': 0.0}
+	Environment:
+	 - revscoring_version: '2.8.2'
+	 - platform: 'Linux-4.9.0-12-amd64-x86_64-with-debian-9.12'
+	 - machine: 'x86_64'
+	 - version: '#1 SMP Debian 4.9.210-1+deb9u1 (2020-06-07)'
+	 - system: 'Linux'
+	 - processor: ''
+	 - python_build: ('default', 'Sep 27 2018 17:25:39')
+	 - python_compiler: 'GCC 6.3.0 20170516'
+	 - python_branch: ''
+	 - python_implementation: 'CPython'
+	 - python_revision: ''
+	 - python_version: '3.5.3'
+	 - release: '4.9.0-12-amd64'
+	
+	Statistics:
+	counts (n=19205):
+		label        n         ~True    ~False
+		-------  -----  ---  -------  --------
+		True      1247  -->      745       502
+		False    17958  -->     1692     16266
+	rates:
+		              True    False
+		----------  ------  -------
+		sample       0.065    0.935
+		population   0.034    0.966
+	match_rate (micro=0.862, macro=0.5):
+		  True    False
+		------  -------
+		 0.111    0.889
+	filter_rate (micro=0.138, macro=0.5):
+		  True    False
+		------  -------
+		 0.889    0.111
+	recall (micro=0.895, macro=0.752):
+		  True    False
+		------  -------
+		 0.597    0.906
+	!recall (micro=0.608, macro=0.752):
+		  True    False
+		------  -------
+		 0.906    0.597
+	precision (micro=0.957, macro=0.584):
+		  True    False
+		------  -------
+		 0.183    0.985
+	!precision (micro=0.211, macro=0.584):
+		  True    False
+		------  -------
+		 0.985    0.183
+	f1 (micro=0.921, macro=0.612):
+		  True    False
+		------  -------
+		  0.28    0.944
+	!f1 (micro=0.303, macro=0.612):
+		  True    False
+		------  -------
+		 0.944     0.28
+	accuracy (micro=0.895, macro=0.895):
+		  True    False
+		------  -------
+		 0.895    0.895
+	fpr (micro=0.392, macro=0.248):
+		  True    False
+		------  -------
+		 0.094    0.403
+	roc_auc (micro=0.862, macro=0.862):
+		  True    False
+		------  -------
+		 0.862    0.862
+	pr_auc (micro=0.97, macro=0.647):
+		  True    False
+		------  -------
+		   0.3    0.993
+	
+	 - score_schema: {'properties': {'prediction': {'description': 'The most likely label predicted by the estimator', 'type': 'boolean'}, 'probability': {'description': 'A mapping of probabilities onto each of the potential output labels', 'properties': {'false': {'type': 'number'}, 'true': {'type': 'number'}}, 'type': 'object'}}, 'title': 'Scikit learn-based classifier score with probability', 'type': 'object'}
+
diff --git a/models/enwiki.reverted.gradient_boosting.model b/models/enwiki.reverted.gradient_boosting.model
new file mode 100644
index 0000000..960e476
--- /dev/null
+++ b/models/enwiki.reverted.gradient_boosting.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2e7a32952e91ab918c39384777c7a14d78eb3e93d2552ea195702c1edde3b7b
+size 10822946