From f28ed311c897d5e52eb5ff0238fd44f1b6550ad1 Mon Sep 17 00:00:00 2001 From: Aaron Halfaker Date: Tue, 14 Jul 2020 19:54:41 +0000 Subject: [PATCH] Trains enwiki reverted_for_damage model --- Makefile | 31 +++++++ model_info/enwiki.reverted.md | 81 +++++++++++++++++++ .../enwiki.reverted.gradient_boosting.model | 3 + 3 files changed, 115 insertions(+) create mode 100644 model_info/enwiki.reverted.md create mode 100644 models/enwiki.reverted.gradient_boosting.model diff --git a/Makefile b/Makefile index a209df1..9a490d7 100644 --- a/Makefile +++ b/Makefile @@ -913,6 +913,37 @@ models/enwiki.damaging.gradient_boosting.model: \ revscoring model_info $@ > model_info/enwiki.damaging.md + +datasets/enwiki.autolabeled_revisions.w_cache.20k_2015.json: \ + datasets/enwiki.labeled_revisions.w_cache.20k_2015.json + cat $< | \ + ./utility autolabel --host=https://en.wikipedia.org \ + --trusted-groups=bot,bureaucrat,sysop \ + --trusted-edits=1000 \ + --revert-radius=5 \ + --verbose > $@ + + +models/enwiki.reverted.gradient_boosting.model: \ + datasets/enwiki.autolabeled_revisions.w_cache.20k_2015.json + cat $< | \ + revscoring cv_train \ + revscoring.scoring.models.GradientBoosting \ + editquality.feature_lists.enwiki.damaging \ + reverted_for_damage \ + --version=$(damaging_major_minor).1 \ + -p 'learning_rate=0.01' \ + -p 'max_depth=7' \ + -p 'max_features="log2"' \ + -p 'n_estimators=700' \ + --label-weight $(damaging_weight) \ + --pop-rate "true=0.034163555464634586" \ + --pop-rate "false=0.9658364445353654" \ + --center --scale > $@ + + revscoring model_info $@ > model_info/enwiki.reverted.md + + tuning_reports/enwiki.goodfaith.md: \ datasets/enwiki.labeled_revisions.w_cache.20k_2015.json cat $< | \ diff --git a/model_info/enwiki.reverted.md b/model_info/enwiki.reverted.md new file mode 100644 index 0000000..8c9119a --- /dev/null +++ b/model_info/enwiki.reverted.md @@ -0,0 +1,81 @@ +Model Information: + - type: GradientBoosting + - version: 0.5.1 + - params: {'n_iter_no_change': None, 'validation_fraction': 0.1, 'ccp_alpha': 0.0, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 700, 'min_impurity_split': None, 'init': None, 'subsample': 1.0, 'multilabel': False, 'label_weights': OrderedDict([(True, 10)]), 'max_depth': 7, 'scale': True, 'max_features': 'log2', 'tol': 0.0001, 'presort': 'deprecated', 'labels': [True, False], 'min_samples_split': 2, 'population_rates': None, 'warm_start': False, 'center': True, 'criterion': 'friedman_mse', 'min_samples_leaf': 1, 'max_leaf_nodes': None, 'loss': 'deviance', 'learning_rate': 0.01, 'random_state': None, 'verbose': 0, 'min_impurity_decrease': 0.0} + Environment: + - revscoring_version: '2.8.2' + - platform: 'Linux-4.9.0-12-amd64-x86_64-with-debian-9.12' + - machine: 'x86_64' + - version: '#1 SMP Debian 4.9.210-1+deb9u1 (2020-06-07)' + - system: 'Linux' + - processor: '' + - python_build: ('default', 'Sep 27 2018 17:25:39') + - python_compiler: 'GCC 6.3.0 20170516' + - python_branch: '' + - python_implementation: 'CPython' + - python_revision: '' + - python_version: '3.5.3' + - release: '4.9.0-12-amd64' + + Statistics: + counts (n=19205): + label n ~True ~False + ------- ----- --- ------- -------- + True 1247 --> 745 502 + False 17958 --> 1692 16266 + rates: + True False + ---------- ------ ------- + sample 0.065 0.935 + population 0.034 0.966 + match_rate (micro=0.862, macro=0.5): + True False + ------ ------- + 0.111 0.889 + filter_rate (micro=0.138, macro=0.5): + True False + ------ ------- + 0.889 0.111 + recall (micro=0.895, macro=0.752): + True False + ------ ------- + 0.597 0.906 + !recall (micro=0.608, macro=0.752): + True False + ------ ------- + 0.906 0.597 + precision (micro=0.957, macro=0.584): + True False + ------ ------- + 0.183 0.985 + !precision (micro=0.211, macro=0.584): + True False + ------ ------- + 0.985 0.183 + f1 (micro=0.921, macro=0.612): + True False + ------ ------- + 0.28 0.944 + !f1 (micro=0.303, macro=0.612): + True False + ------ ------- + 0.944 0.28 + accuracy (micro=0.895, macro=0.895): + True False + ------ ------- + 0.895 0.895 + fpr (micro=0.392, macro=0.248): + True False + ------ ------- + 0.094 0.403 + roc_auc (micro=0.862, macro=0.862): + True False + ------ ------- + 0.862 0.862 + pr_auc (micro=0.97, macro=0.647): + True False + ------ ------- + 0.3 0.993 + + - score_schema: {'properties': {'prediction': {'description': 'The most likely label predicted by the estimator', 'type': 'boolean'}, 'probability': {'description': 'A mapping of probabilities onto each of the potential output labels', 'properties': {'false': {'type': 'number'}, 'true': {'type': 'number'}}, 'type': 'object'}}, 'title': 'Scikit learn-based classifier score with probability', 'type': 'object'} + diff --git a/models/enwiki.reverted.gradient_boosting.model b/models/enwiki.reverted.gradient_boosting.model new file mode 100644 index 0000000..960e476 --- /dev/null +++ b/models/enwiki.reverted.gradient_boosting.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2e7a32952e91ab918c39384777c7a14d78eb3e93d2552ea195702c1edde3b7b +size 10822946