diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..ca63d1dc --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,40 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Attach", + "type": "python", + "request": "attach", + "connect": { + "host": "localhost", + "port": 5678 + } + }, + { + "name": "Python: Module", + "type": "python", + "request": "launch", + "module": "code", + "cwd": "${workspaceFolder}", + }, + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "cwd": "${workspaceFolder}", + // "pythonArgs": [ + // "-m", + // "src.feature_extraction.test.feature_extraction_test", + // "E:\\MyPC\\code\\git\\myforkMLiP\\MLinPractice\\src\\feature_extraction\\test\\feature_extraction_test.py" + // ], + // "env": { + // "PYTHONPATH": "${workspaceFolder}/code" + // } + } + ] +} \ No newline at end of file diff --git a/README.md b/README.md index f1c12d81..ea9e9aff 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ conda install -y -q -c conda-forge gensim=4.1.2 conda install -y -q -c conda-forge spyder=5.1.5 conda install -y -q -c conda-forge pandas=1.1.5 conda install -y -q -c conda-forge mlflow=1.20.2 +conda install -y -q -c conda-forge vaderSentiment=3.3.2 ``` You can double-check that all of these packages have been installed by running `conda list` inside of your virtual environment. The Spyder IDE can be started by typing `~/miniconda/envs/MLinPractice/bin/spyder` in your terminal window (assuming you use miniconda, which is installed right in your home directory). @@ -27,25 +28,68 @@ In order to save some space on your local machine, you can run `conda clean -y - The installed libraries are used for machine learning (`scikit-learn`), visualizations (`matplotlib`), NLP (`nltk`), word embeddings (`gensim`), and IDE (`spyder`), and data handling (`pandas`) -## Overall Pipeline +## Setup -The overall pipeline can be executed with the script `code/pipeline.sh`, which executes all of the following shell scripts: -- The script `code/load_data.sh` downloads the raw csv files containing the tweets and their metadata. They are stored in the folder `data/raw/` (which will be created if it does not yet exist). -- The script `code/preprocessing.sh` executes all necessary preprocessing steps, including a creation of labels and splitting the data set. -- The script `code/feature_extraction.sh` takes care of feature extraction. -- The script `code/dimensionality_reduction.sh` takes care of dimensionality reduction. -- The script `code/classification.sh` takes care of training and evaluating a classifier. -- The script `code/application.sh` launches the application example. +First of all, the shell script `src/setup.sh` needs to be run once before the actual `src/pipeline.sh` script or any other shell scripts can be executed. The setup script downloads necessary data by executing the scripts `src/load_data.sh` and `src/load_nltk_data.sh`. +- The former script `src/load_data.sh` downloads the Data Science Tweets as raw csv files containing the tweets and their metadata. They are stored in the directory `data/raw/` (which will be created if it does not yet exist).
+- The latter script `src/load_nltk_data.sh` downloads necessary NLTK data sets, corpora and models (see more: [nltk.org/data.html](https://www.nltk.org/data.html)) + +## Running Scripts and Unit Tests + +To run bash scripts, you need to open a bash shell. On Unix systems (Linux and MacOS) such a shell already comes with the operating system. On Windows, it needs to be installed manually. When you install git, the git bash shell will be installed as well. Once you open a terminal window, you can either directly write the path to the script you want to execute or prepend `bash` before it. Both of the following example commands should work: + +```console +./src/setup.sh +``` + +```console +bash ./src/setup.sh +``` + +In case this throws an error like `permission denied` or something similar, you might need to change the access level of some files. This can be done by executing the following command: + +```console +chmod -R a+x ./src +``` + +This grants execute permission to all users for all files (recursively) in the `src` directory. + +### Pipeline Scripts + +The overall pipeline can be executed with the script `src/pipeline.sh`, which executes all of the following shell scripts: +- `src/preprocessing.sh`: Executes all necessary preprocessing steps, including the creation of labels and splitting the data set. +- `src/feature_extraction.sh`: Takes care of feature extraction. +- `src/dimensionality_reduction.sh`: Takes care of dimensionality reduction. +- `src/classification.sh`: Takes care of training and evaluating a classifier. The script specifies one of 5 possible classification scenarios; 4 of them are commented out. Comment the lines in or out so that only one scenario will be run for training on the training set. Additionally, the same classifier is used for evaluation on the validation set. + +### Additional Scripts +- `src/application.sh`: Launches the application example. +- `src/classification_hyper_param.sh`: Trains and evaluates two classifiers over a predefined range of parameters (grid search). +- `src/final_classification.sh`: Trains the best two classifiers on the training data set and afterwards evaluates their performance on the test data set in comparison to the *stratified* baseline. +- `src/setup.sh`: As described in detail above, downloads the necessary data. + +### Unit Tests + +The following command runs all unit tests in the `src` directory for files whose names end in `_test.py`: + +```bash +python -m unittest discover -s src -p '*_test.py' +``` ## Preprocessing -All python scripts and classes for the preprocessing of the input data can be found in `code/preprocessing/`. +All python scripts and classes for the preprocessing of the input data can be found in [`src/preprocessing/`](src/preprocessing/). ### Creating Labels -The script `create_labels.py` assigns labels to the raw data points based on a threshold on a linear combination of the number of likes and retweets. It is executed as follows: -```python -m code.preprocessing.create_labels path/to/input_dir path/to/output.csv``` -Here, `input_dir` is the directory containing the original raw csv files, while `output.csv` is the single csv file where the output will be written. +The script [`create_labels.py`](src/preprocessing/create_labels.py) assigns labels to the raw data points based on a threshold on a linear combination of the number of likes and retweets.
It is executed as follows: + +```bash +python -m src.preprocessing.create_labels path/to/input_dir path/to/output.csv +``` + +Here, `input_dir` is the directory containing the original raw csv files, while `output.csv` is the single csv file where the output will be stored. + The script takes the following optional parameters: - `-l` or `--likes_weight` determines the relative weight of the number of likes a tweet has received. Defaults to 1. - `-r` or `--retweet_weight` determines the relative weight of the number of retweets a tweet has received. Defaults to 1. @@ -53,20 +97,32 @@ The script takes the following optional parameters: ### Classical Preprocessing -The script `run_preprocessing.py` is used to run various preprocessing steps on the raw data, producing additional columns in the csv file. It is executed as follows: -```python -m code.preprocessing.run_preprocessing path/to/input.csv path/to/output.csv``` +The script [`run_preprocessing.py`](src/preprocessing/run_preprocessing.py) is used to run various preprocessing steps on the raw data, producing additional columns in the csv file. It is executed as follows: + +```bash +python -m src.preprocessing.run_preprocessing path/to/input.csv path/to/output.csv +``` + Here, `input.csv` is a csv file (ideally the output of `create_labels.py`), while `output.csv` is the csv file where the output will be written. -The preprocessing steps to take can be configured with the following flags: -- `-p` or `--punctuation`: A new column "tweet_no_punctuation" is created, where all punctuation is removed from the original tweet. (See `code/preprocessing/punctuation_remover.py` for more details) -- `-t`or `--tokenize`: Tokenize the given column (can be specified by `--tokenize_input`, default = "tweet"), and create new column with suffix "_tokenized" containing tokenized tweet. + +The following flags configure which preprocessing steps are applied: + +- `-p` or `--punctuation`: A new column *"tweet_no_punctuation"* is created, where all punctuation is removed from the original tweet. (See [punctuation_remover.py](src/preprocessing/preprocessors/punctuation_remover.py) for more details) +- `-t` or `--tokenize`: Tokenize the given column (can be specified by `--tokenize_input`, default = "tweet"), and create a new column with the suffix "_tokenized" containing the tokenized tweet. +- `-o` or `--other`: Executes all the other preprocessing steps, such as the removal of non-English records and the removal of unnecessary columns. Moreover, the script accepts the following optional parameters: + - `-e` or `--export` gives the path to a pickle file where an sklearn pipeline of the different preprocessing steps will be stored for later usage. ### Splitting the Data Set -The script `split_data.py` splits the overall preprocessed data into training, validation, and test set. It can be invoked as follows: -```python -m code.preprocessing.split_data path/to/input.csv path/to/output_dir``` +The script [`split_data.py`](src/preprocessing/split_data.py) splits the overall preprocessed data into training, validation, and test set. It can be invoked as follows: + +```bash +python -m src.preprocessing.split_data path/to/input.csv path/to/output_dir +``` + Here, `input.csv` is the input csv file to split (containing a column "label" with the label information, i.e., `create_labels.py` needs to be run beforehand) and `output_dir` is the directory where three individual csv files `training.csv`, `validation.csv`, and `test.csv` will be stored.
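Note that the test and validation sizes taken by the optional parameters below are both fractions of the *full* data set; internally the script splits twice, so the validation fraction is rescaled for the second split. A minimal arithmetic sketch, mirroring the two-step `train_test_split` logic in `split_data.py` (default values assumed):

```python
# Sketch of the two-step split performed by split_data.py (defaults: -t 0.2, -v 0.2).
test_size = 0.2        # fraction of the full data set reserved for testing
validation_size = 0.2  # fraction of the full data set reserved for validation

# First split: (training + validation) vs. test  ->  80 % / 20 %.
# Second split: validation is taken from the remaining 80 %, so its relative
# size has to be rescaled.
relative_validation_size = validation_size / (1 - test_size)
print(relative_validation_size)  # 0.25, i.e. 20 % of the full data set

# Resulting proportions: 60 % training, 20 % validation, 20 % test.
```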
The script takes the following optional parameters: - `-t` or `--test_size` determines the relative size of the test set and defaults to 0.2 (i.e., 20 % of the data). @@ -76,17 +132,21 @@ The script takes the following optional parameters: ## Feature Extraction -All python scripts and classes for feature extraction can be found in `code/feature_extraction/`. +All python scripts and classes for feature extraction can be found in [`src/feature_extraction/`](src/feature_extraction). + +The script [`extract_features.py`](src/feature_extraction/extract_features.py) takes care of the overall feature extraction process and can be invoked as follows: + +```bash +python -m src.feature_extraction.extract_features path/to/input.csv path/to/output.pickle +``` -The script `extract_features.py` takes care of the overall feature extraction process and can be invoked as follows: -```python -m code.feature_extraction.extract_features path/to/input.csv path/to/output.pickle``` Here, `input.csv` is the respective training, validation, or test set file created by `split_data.py`. The file `output.pickle` will be used to store the results of the feature extraction process, namely a dictionary with the following entries: - `"features"`: a numpy array with the raw feature values (rows are training examples, colums are features) - `"feature_names"`: a list of feature names for the columns of the numpy array - `"labels"`: a numpy array containing the target labels for the feature vectors (rows are training examples, only column is the label) The features to be extracted can be configured with the following optional parameters: -- `-c` or `--char_length`: Count the number of characters in the "tweet" column of the data frame. (see code/feature_extraction/character_length.py) +- `-c` or `--char_length`: Count the number of characters in the "tweet" column of the data frame. (see [`character_length.py`](src/feature_extraction/feature_extractors/character_length.py)) Moreover, the script support importing and exporting fitted feature extractors with the following optional arguments: - `-i` or `--import_file`: Load a configured and fitted feature extraction from the given pickle file. Ignore all parameters that configure the features to extract. @@ -94,11 +154,14 @@ Moreover, the script support importing and exporting fitted feature extractors w ## Dimensionality Reduction -All python scripts and classes for dimensionality reduction can be found in `code/dimensionality_reduction/`. +All python scripts and classes for dimensionality reduction can be found in [`src/dimensionality_reduction/`](src/dimensionality_reduction/). -The script `reduce_dimensionality.py` takes care of the overall dimensionality reduction procedure and can be invoked as follows: +The script [`reduce_dimensionality.py`](src/dimensionality_reduction/reduce_dimensionality.py) takes care of the overall dimensionality reduction procedure and can be invoked as follows: + +``` +python -m src.dimensionality_reduction.reduce_dimensionality path/to/input.pickle path/to/output.pickle +``` -```python -m code.dimensionality_reduction.reduce_dimensionality path/to/input.pickle path/to/output.pickle``` Here, `input.pickle` is the respective training, validation, or test set file created by `extract_features.py`. The file `output.pickle` will be used to store the results of the dimensionality reduction process, containing `"features"` (which are the selected/projected ones) and `"labels"` (same as in the input file). 
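For reference, the pickle files written by `extract_features.py` and `reduce_dimensionality.py` can be inspected directly; a minimal sketch (assuming the default output paths used by the pipeline shell scripts):

```python
# Inspect the dictionaries stored by extract_features.py and reduce_dimensionality.py.
# The paths below are assumptions based on the default locations used by the shell scripts.
import pickle

with open("data/feature_extraction/training.pickle", "rb") as f_in:
    extracted = pickle.load(f_in)
print(extracted["feature_names"])   # list of feature names (one per column)
print(extracted["features"].shape)  # (num_examples, num_features)
print(extracted["labels"].shape)    # (num_examples, 1)

with open("data/dimensionality_reduction/training.pickle", "rb") as f_in:
    reduced = pickle.load(f_in)
print(reduced["features"].shape)    # only the selected/projected features remain
print(reduced["labels"].shape)      # labels are passed through unchanged
```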
@@ -113,35 +176,112 @@ Finally, if the flag `--verbose` is set, the script outputs some additional info ## Classification -All python scripts and classes for classification can be found in `code/classification/`. +All python scripts and classes for classification can be found in [`src/classification/`](src/classification/). ### Train and Evaluate a Single Classifier -The script `run_classifier.py` can be used to train and/or evaluate a given classifier. It can be executed as follows: -```python -m code.classification.run_classifier path/to/input.pickle``` -Here, `input.pickle` is a pickle file of the respective data subset, produced by either `extract_features.py` or `reduce_dimensionality.py`. +The script [`run_classifier.py`](src/classification/run_classifier.py) can be used to train and/or evaluate a given classifier. It can be executed as follows: -By default, this data is used to train a classifier, which is specified by one of the following optional arguments: -- `-m` or `--majority`: Majority vote classifier that always predicts the majority class. -- `-f` or `--frequency`: Dummy classifier that makes predictions based on the label frequency in the training data. - -The classifier is then evaluated, using the evaluation metrics as specified through the following optional arguments: -- `-a`or `--accuracy`: Classification accurracy (i.e., percentage of correctly classified examples). -- `-k`or `--kappa`: Cohen's kappa (i.e., adjusting accuracy for probability of random agreement). +``` +python -m src.classification.run_classifier path/to/input.pickle +``` +Here, `input.pickle` is a pickle file of the respective data subset, produced by either `extract_features.py` or `reduce_dimensionality.py`. -Moreover, the script support importing and exporting trained classifiers with the following optional arguments: +The script supports **importing and exporting trained classifiers** with the following optional arguments: - `-i` or `--import_file`: Load a trained classifier from the given pickle file. Ignore all parameters that configure the classifier to use and don't retrain the classifier. - `-e` or `--export_file`: Export the trained classifier into the given pickle file. + +By default, this data is used to train a **classifier**. It is possible to choose 1 of 5 different scenarios for training: either select one of the two dummy classifiers as a baseline, or the knn or random forest classifier. For the random forest classifier, it can additionally be specified whether or not to perform a grid search. + +Dummy classifiers (baselines): +- `-d` or `--dummyclassifier` followed by either `most_frequent` or `stratified` + - `most_frequent` is a [_DummyClassifier_](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html) which always predicts the most frequently occurring label in the training set. + - `stratified` is a [_DummyClassifier_](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html) that makes predictions based on the label frequency in the training data (respects the training set’s class distribution). + +Choose one of three possible options for the classification: +- `--knn` followed by an integer value for k +- `-r` or `--randomforest` followed by an integer number of trees + - if `--sk_gridsearch_rf` is omitted, a normal random forest classifier with the provided number of trees will be used for training.
+ - if `--sk_gridsearch_rf` is present, a grid search on a random forest classifier with a predefined (hardcoded) range of parameters is performed. Also, the number of trees is still expected, but will be ignored. + +The classifier is then evaluated using **evaluation metrics**. Which metrics to use for evaluation can be specified with the following optional arguments: +- `-m` or `--metrics` followed by another option (default is `kappa`): + - `none`: no metrics will be used + - `all`: all metrics will be used + - `accuracy`: Classification accuracy (i.e., percentage of correctly classified examples). + - `kappa`: Cohen's kappa (i.e., adjusting accuracy for probability of random agreement). + - `precision`: Precision (the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative) + - `recall`: Recall (the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples) + - `f1`: F1-score (weighted average of precision and recall) + - `jaccard`: Jaccard score (the size of the intersection divided by the size of the union of two label sets) + +For more details on the metrics used, see: https://scikit-learn.org/stable/modules/classes.html#classification-metrics + +Logging with MlFlow: + +- `--log_folder` specifies where MlFlow will store its logging files. Default is `data/classification/mlflow`. +- `-n` or `--run_name` specifies a name for the classification run, so that runs can be identified afterwards when looking at the results in the MlFlow user interface. + Finally, the optional argument `-s` or `--seed` determines the seed for intializing the random number generator (which may be important for some classifiers). Using the same seed across multiple runs ensures reproducibility of the results. If no seed is set, the current system time will be used. ## Application -All python code for the application demo can be found in `code/application/`. +All python code for the application demo can be found in [`src/application/`](src/application/). -The script `application.py` provides a simple command line interface, where the user is asked to type in their prospective tweet, which is then analyzed using the trained ML pipeline. +The script [`application.py`](src/application/application.py) provides a simple command line interface, where the user is asked to type in their prospective tweet, which is then analyzed using the trained ML pipeline. The script can be invoked as follows: + +``` +python -m src.application.application path/to/preprocessing.pickle path/to/feature_extraction.pickle path/to/dimensionality_reduction.pickle path/to/classifier.pickle +``` + The four pickle files correspond to the exported versions for the different pipeline steps as created by `run_preprocessing.py`, `extract_features.py`, `reduce_dimensionality.py`, and `run_classifier.py`, respectively, with the `-e` option. + + +## Running MlFlow + +To look at the MlFlow results, run the following command. This will host a local server on [http://127.0.0.1:5000](http://127.0.0.1:5000). Opening it displays the results of all previous runs on a web page.
The runs can also be exported as csv files. + +``` +mlflow ui --backend-store-uri data/classification/mlflow +``` + +MlFlow allows us to specify an SQL-like search for specific data. +For example, use `params.classifier = "knn"` to search for all entries where a knn classifier was used. + +Here is another example that only displays runs with a randomforest classifier on the validation set: + +``` +params.classifier = "randomforest" AND params.dataset = "validation" +``` + +More information at: [mlflow.org/docs/latest/search-syntax.html#syntax](https://www.mlflow.org/docs/latest/search-syntax.html#syntax) + +## Debugging in Visual Studio Code + +1. Run the file in debug mode configured to wait for a client, because otherwise it would just finish too quickly: + +``` +python -m debugpy --wait-for-client --listen 5678 .\src\feature_extraction\test\feature_extraction_test.py +``` + +2. Use the following `launch.json` configuration to attach the editor to the already started debug process: + +```json +"configurations": [ + { + "name": "Python: Attach", + "type": "python", + "request": "attach", + "connect": { + "host": "localhost", + "port": 5678 + } + }, +] +``` + +3. Start the attach debug configuration via the VS Code UI ([F5] key or `Run`/`Run and Debug` menu). diff --git a/code/application.sh b/code/application.sh deleted file mode 100755 index da31860e..00000000 --- a/code/application.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -# execute the application with all necessary pickle files -echo "Starting the application..." -python -m code.application.application data/preprocessing/pipeline.pickle data/feature_extraction/pipeline.pickle data/dimensionality_reduction/pipeline.pickle data/classification/classifier.pickle \ No newline at end of file diff --git a/code/application/application.py b/code/application/application.py deleted file mode 100644 index 84ecb543..00000000 --- a/code/application/application.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Console-based application for tweet classification.
- -Created on Wed Sep 29 14:49:25 2021 - -@author: lbechberger -""" - -import argparse, pickle -import pandas as pd -from sklearn.pipeline import make_pipeline -from code.util import COLUMN_TWEET - -# setting up CLI -parser = argparse.ArgumentParser(description = "Application") -parser.add_argument("preprocessing_file", help = "path to the pickle file containing the preprocessing") -parser.add_argument("feature_file", help = "path to the pickle file containing the feature extraction") -parser.add_argument("dim_red_file", help = "path to the pickle file containing the dimensionality reduction") -parser.add_argument("classifier_file", help = "path to the pickle file containing the classifier") -args = parser.parse_args() - -# load all the pipeline steps -with open(args.preprocessing_file, 'rb') as f_in: - preprocessing = pickle.load(f_in) -with open(args.feature_file, 'rb') as f_in: - feature_extraction = pickle.load(f_in) -with open(args.dim_red_file, 'rb') as f_in: - dimensionality_reduction = pickle.load(f_in) -with open(args.classifier_file, 'rb') as f_in: - classifier = pickle.load(f_in)["classifier"] - -# chain them together into a single pipeline -pipeline = make_pipeline(preprocessing, feature_extraction, dimensionality_reduction, classifier) - -# headline output -print("Welcome to ViralTweeter v0.1!") -print("-----------------------------") -print("") - -while True: - # ask user for input - tweet = input("Please type in your tweet (type 'quit' to quit the program): ") - - # terminate if necessary - if tweet == "quit": - print("Okay, goodbye!") - break - - # if not terminated: create pandas DataFrame and put it through the pipeline - df = pd.DataFrame() - df[COLUMN_TWEET] = [tweet] - - prediction = pipeline.predict(df) - confidence = pipeline.predict_proba(df) - - print("Prediction: {0}, Confidence: {1}".format(prediction, confidence)) - print("") - diff --git a/code/classification.sh b/code/classification.sh deleted file mode 100755 index ceb7ac18..00000000 --- a/code/classification.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# create directory if not yet existing -mkdir -p data/classification/ - -# run feature extraction on training set (may need to fit extractors) -echo " training set" -python -m code.classification.run_classifier data/dimensionality_reduction/training.pickle -e data/classification/classifier.pickle --knn 5 -s 42 --accuracy --kappa - -# run feature extraction on validation set (with pre-fit extractors) -echo " validation set" -python -m code.classification.run_classifier data/dimensionality_reduction/validation.pickle -i data/classification/classifier.pickle --accuracy --kappa - -# don't touch the test set, yet, because that would ruin the final generalization experiment! \ No newline at end of file diff --git a/code/classification/run_classifier.py b/code/classification/run_classifier.py deleted file mode 100644 index 414e0ce5..00000000 --- a/code/classification/run_classifier.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Train or evaluate a single classifier with its given set of hyperparameters. 
- -Created on Wed Sep 29 14:23:48 2021 - -@author: lbechberger -""" - -import argparse, pickle -from sklearn.dummy import DummyClassifier -from sklearn.metrics import accuracy_score, cohen_kappa_score -from sklearn.preprocessing import StandardScaler -from sklearn.neighbors import KNeighborsClassifier -from sklearn.pipeline import make_pipeline -from mlflow import log_metric, log_param, set_tracking_uri - -# setting up CLI -parser = argparse.ArgumentParser(description = "Classifier") -parser.add_argument("input_file", help = "path to the input pickle file") -parser.add_argument("-s", '--seed', type = int, help = "seed for the random number generator", default = None) -parser.add_argument("-e", "--export_file", help = "export the trained classifier to the given location", default = None) -parser.add_argument("-i", "--import_file", help = "import a trained classifier from the given location", default = None) -parser.add_argument("-m", "--majority", action = "store_true", help = "majority class classifier") -parser.add_argument("-f", "--frequency", action = "store_true", help = "label frequency classifier") -parser.add_argument("--knn", type = int, help = "k nearest neighbor classifier with the specified value of k", default = None) -parser.add_argument("-a", "--accuracy", action = "store_true", help = "evaluate using accuracy") -parser.add_argument("-k", "--kappa", action = "store_true", help = "evaluate using Cohen's kappa") -parser.add_argument("--log_folder", help = "where to log the mlflow results", default = "data/classification/mlflow") -args = parser.parse_args() - -# load data -with open(args.input_file, 'rb') as f_in: - data = pickle.load(f_in) - -set_tracking_uri(args.log_folder) - -if args.import_file is not None: - # import a pre-trained classifier - with open(args.import_file, 'rb') as f_in: - input_dict = pickle.load(f_in) - - classifier = input_dict["classifier"] - for param, value in input_dict["params"].items(): - log_param(param, value) - - log_param("dataset", "validation") - -else: # manually set up a classifier - - if args.majority: - # majority vote classifier - print(" majority vote classifier") - log_param("classifier", "majority") - params = {"classifier": "majority"} - classifier = DummyClassifier(strategy = "most_frequent", random_state = args.seed) - - elif args.frequency: - # label frequency classifier - print(" label frequency classifier") - log_param("classifier", "frequency") - params = {"classifier": "frequency"} - classifier = DummyClassifier(strategy = "stratified", random_state = args.seed) - - - elif args.knn is not None: - print(" {0} nearest neighbor classifier".format(args.knn)) - log_param("classifier", "knn") - log_param("k", args.knn) - params = {"classifier": "knn", "k": args.knn} - standardizer = StandardScaler() - knn_classifier = KNeighborsClassifier(args.knn, n_jobs = -1) - classifier = make_pipeline(standardizer, knn_classifier) - - classifier.fit(data["features"], data["labels"].ravel()) - log_param("dataset", "training") - -# now classify the given data -prediction = classifier.predict(data["features"]) - -# collect all evaluation metrics -evaluation_metrics = [] -if args.accuracy: - evaluation_metrics.append(("accuracy", accuracy_score)) -if args.kappa: - evaluation_metrics.append(("Cohen_kappa", cohen_kappa_score)) - -# compute and print them -for metric_name, metric in evaluation_metrics: - metric_value = metric(data["labels"], prediction) - print(" {0}: {1}".format(metric_name, metric_value)) - log_metric(metric_name, metric_value) - -# 
export the trained classifier if the user wants us to do so -if args.export_file is not None: - output_dict = {"classifier": classifier, "params": params} - with open(args.export_file, 'wb') as f_out: - pickle.dump(output_dict, f_out) \ No newline at end of file diff --git a/code/dimensionality_reduction.sh b/code/dimensionality_reduction.sh deleted file mode 100755 index b82230b5..00000000 --- a/code/dimensionality_reduction.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# create directory if not yet existing -mkdir -p data/dimensionality_reduction/ - -# run dimensionality reduction on training set to fit the parameters -echo " training set" -python -m code.dimensionality_reduction.reduce_dimensionality data/feature_extraction/training.pickle data/dimensionality_reduction/training.pickle -e data/dimensionality_reduction/pipeline.pickle -m 1 --verbose - -# run feature extraction on validation set and test set (with pre-fit parameters) -echo " validation set" -python -m code.dimensionality_reduction.reduce_dimensionality data/feature_extraction/validation.pickle data/dimensionality_reduction/validation.pickle -i data/dimensionality_reduction/pipeline.pickle -echo " test set" -python -m code.dimensionality_reduction.reduce_dimensionality data/feature_extraction/test.pickle data/dimensionality_reduction/test.pickle -i data/dimensionality_reduction/pipeline.pickle diff --git a/code/dimensionality_reduction/reduce_dimensionality.py b/code/dimensionality_reduction/reduce_dimensionality.py deleted file mode 100644 index d2b27419..00000000 --- a/code/dimensionality_reduction/reduce_dimensionality.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Apply a dimensionality reduction technique. - -Created on Wed Sep 29 13:33:37 2021 - -@author: lbechberger -""" - -import argparse, pickle -from sklearn.feature_selection import SelectKBest, mutual_info_classif - - -# setting up CLI -parser = argparse.ArgumentParser(description = "Dimensionality reduction") -parser.add_argument("input_file", help = "path to the input pickle file") -parser.add_argument("output_file", help = "path to the output pickle file") -parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None) -parser.add_argument("-i", "--import_file", help = "import an existing pipeline from the given location", default = None) -parser.add_argument("-m", "--mutual_information", type = int, help = "select K best features with Mutual Information", default = None) -parser.add_argument("--verbose", action = "store_true", help = "print information about feature selection process") -args = parser.parse_args() - -# load the data -with open(args.input_file, 'rb') as f_in: - input_data = pickle.load(f_in) - -features = input_data["features"] -labels = input_data["labels"] -feature_names = input_data["feature_names"] - -if args.import_file is not None: - # simply import an already fitted dimensionality reducer - with open(args.import_file, 'rb') as f_in: - dim_red = pickle.load(f_in) - -else: # need to set things up manually - - if args.mutual_information is not None: - # select K best based on Mutual Information - dim_red = SelectKBest(mutual_info_classif, k = args.mutual_information) - dim_red.fit(features, labels.ravel()) - - # resulting feature names based on support given by SelectKBest - def get_feature_names(kbest, names): - support = kbest.get_support() - result = [] - for name, selected in zip(names, support): - if selected: - result.append(name) - 
return result - - if args.verbose: - print(" SelectKBest with Mutual Information and k = {0}".format(args.mutual_information)) - print(" {0}".format(feature_names)) - print(" " + str(dim_red.scores_)) - print(" " + str(get_feature_names(dim_red, feature_names))) - pass - -# apply the dimensionality reduction to the given features -reduced_features = dim_red.transform(features) - -# store the results -output_data = {"features": reduced_features, - "labels": labels} -with open(args.output_file, 'wb') as f_out: - pickle.dump(output_data, f_out) - -# export the dimensionality reduction technique as pickle file if desired by user -if args.export_file is not None: - with open(args.export_file, 'wb') as f_out: - pickle.dump(dim_red, f_out) \ No newline at end of file diff --git a/code/examples.py b/code/examples.py deleted file mode 100644 index 69b2b3e3..00000000 --- a/code/examples.py +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Oct 7 09:20:09 2021 - -@author: ml -""" - -############################################################################### -######################## DATA VISUALIZATION ############################## -############################################################################### - -# plotting with pandas -import csv -import pandas as pd - -df = pd.read_csv("data/preprocessing/preprocessed.csv", quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n") - -df["language"].value_counts().plot(kind = 'bar') -df["language"].value_counts().plot(kind = 'bar', logy = True) - -df["date"] = df["date"].astype("datetime64") -df["label"].groupby(df["date"].dt.month).count().plot(kind = 'bar') - - -# plotting with matplotlib -import pickle -from matplotlib import pyplot as plt -import numpy as np - -with open("data/feature_extraction/training.pickle", "rb") as f_in: - data = pickle.load(f_in) - -features = data["features"] -labels = data["labels"] - -plt.hist(features) -plt.hist(features, range = [0,400]) - -pos = features[labels] -neg_index = np.array([not x for x in labels]) -neg = features[neg_index] - -bins = [0, 50, 100, 150, 200, 250, 300, 350, 400] - -plt.hist(pos, bins = bins) -plt.hist(neg, bins = bins) - -############################################################################### -######################## FEATURE EXTRACTION ############################## -############################################################################### - -# bigrams -import nltk -import string - -text = "John Wilkes Booth shot Abraham Lincoln. Abraham Lincoln was not shot inside the White House." -tokens = nltk.word_tokenize(text) -tokens = [token for token in tokens if token not in string.punctuation] - -bigrams = nltk.bigrams(tokens) -freq_dist = nltk.FreqDist(bigrams) -freq_list = [] -for bigram, freq in freq_dist.items(): - freq_list.append([bigram, freq]) -freq_list.sort(key = lambda x: x[1], reverse = True) -for i in range(len(freq_list)): - print(freq_list[i]) - - -# tf-idf -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.metrics.pairwise import cosine_similarity - -tweets = df["tweet"][:100] -vectorizer = TfidfVectorizer() -tf_idf_vectors = vectorizer.fit_transform(tweets).todense() - -print(tf_idf_vectors.shape) -print(vectorizer.get_feature_names()[142:145]) -print(tf_idf_vectors[66:71, 142:145]) - -tf_idf_similarities = cosine_similarity(tf_idf_vectors) -print(tf_idf_similarities[:5,:5]) - - -# NER -text = "John Wilkes Booth shot Abraham Lincoln. Abraham Lincoln was not shot inside the White House." 
-sentences = nltk.sent_tokenize(text) -for sentence in sentences: - words = nltk.word_tokenize(sentence) - pos_tagged = nltk.pos_tag(words) - ne_chunked = nltk.ne_chunk(pos_tagged) - print(ne_chunked) - - -# WordNet -dog_synsets = nltk.corpus.wordnet.synsets('dog') -for syn in dog_synsets: - words = [str(lemma.name()) for lemma in syn.lemmas()] - print(syn, words, syn.definition(), syn.hypernyms()) - print("") - - -# word2vec -import gensim.downloader as api - -embeddings = api.load('word2vec-google-news-300') -pairs = [('car', 'minivan'), ('car', 'airplane'), ('car', 'cereal')] - -for w1, w2 in pairs: - print("{0} - {1}: {2}".format(w1, w2, embeddings.similarity(w1, w2))) - -dog_vector = embeddings['dog'] - - -# one hot encoding -from sklearn.preprocessing import OneHotEncoder -import numpy as np - -features = np.array([["morning"], ["afternoon"], ["evening"], ["night"], ["afternoon"]]) -encoder = OneHotEncoder(sparse = False) -encoder.fit(features) -encoder.transform(features) - - -############################################################################### -##################### DIMENSIONALITY REDUCTION ########################### -############################################################################### - -from sklearn.datasets import load_breast_cancer -from sklearn.decomposition import PCA -from sklearn.linear_model import LogisticRegression -from sklearn.feature_selection import RFE, SelectKBest, mutual_info_classif, SelectFromModel -from sklearn.ensemble import RandomForestClassifier -import numpy as np - -data_set = load_breast_cancer() -X = data_set.data -y = data_set.target -print("Data Set: ", X.shape, y.shape) -print("Combinatorics of binary feature values:", 2**30) - - -# PCA -print("\nPCA") -print('---') -pca = PCA(random_state = 42) -pca.fit(X) -print("explained variance (percentage): ", pca.explained_variance_ratio_) -print('most important component: ', pca.components_[0]) -pca_transformed = pca.transform(X) -pca_transformed = pca_transformed[:,0:1] -print("after transformation: ", pca_transformed.shape, y.shape) -print("Compare: ", X[0], pca_transformed[0]) - - -# wrapper -print("\nWrapper") -print("-------") - -model = LogisticRegression(random_state = 42, max_iter = 10000) -rfe = RFE(model, n_features_to_select = 2) -rfe.fit(X,y) -print("Feature ranking according to RFE/LogReg:", rfe.ranking_) -index_of_first = np.where(rfe.ranking_ == 1)[0][0] -index_of_second = np.where(rfe.ranking_ == 2)[0][0] -print("Two most promising features: ", index_of_first, index_of_second) -wrapper_transformed = rfe.transform(X) -print("After transformation: ", wrapper_transformed.shape, y.shape) -print("compare: ", X[0], wrapper_transformed[0]) - - -# Filter -print("\n Filter") -print("------") -skb = SelectKBest(score_func = mutual_info_classif, k = 3) -skb.fit(X,y) -print("Feature scores according to MI: ", skb.scores_) -filter_transformed = skb.transform(X) -print("After transformation: ", filter_transformed.shape, y.shape) -print("Compare: ", X[0], filter_transformed[0]) - - - -# Embedded -print("\nEmbedded") -print("--------") -rf = RandomForestClassifier(n_estimators = 10, random_state=42) -rf.fit(X,y) -print("Feature imporance according to RF: ", rf.feature_importances_) -sfm = SelectFromModel(rf, threshold = 0.1, prefit = True) -embedded_transformed = sfm.transform(X) -print("After transformation: ", embedded_transformed.shape, y.shape) -print("Compare: ", X[0], embedded_transformed[0]) - - - - - - - - - - - - - - - - - diff --git a/code/feature_extraction.sh 
b/code/feature_extraction.sh deleted file mode 100755 index f494f835..00000000 --- a/code/feature_extraction.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# create directory if not yet existing -mkdir -p data/feature_extraction/ - -# run feature extraction on training set (may need to fit extractors) -echo " training set" -python -m code.feature_extraction.extract_features data/preprocessing/split/training.csv data/feature_extraction/training.pickle -e data/feature_extraction/pipeline.pickle --char_length - -# run feature extraction on validation set and test set (with pre-fit extractors) -echo " validation set" -python -m code.feature_extraction.extract_features data/preprocessing/split/validation.csv data/feature_extraction/validation.pickle -i data/feature_extraction/pipeline.pickle -echo " test set" -python -m code.feature_extraction.extract_features data/preprocessing/split/test.csv data/feature_extraction/test.pickle -i data/feature_extraction/pipeline.pickle \ No newline at end of file diff --git a/code/feature_extraction/extract_features.py b/code/feature_extraction/extract_features.py deleted file mode 100644 index a3527acf..00000000 --- a/code/feature_extraction/extract_features.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Runs the specified collection of feature extractors. - -Created on Wed Sep 29 11:00:24 2021 - -@author: lbechberger -""" - -import argparse, csv, pickle -import pandas as pd -import numpy as np -from code.feature_extraction.character_length import CharacterLength -from code.feature_extraction.feature_collector import FeatureCollector -from code.util import COLUMN_TWEET, COLUMN_LABEL - - -# setting up CLI -parser = argparse.ArgumentParser(description = "Feature Extraction") -parser.add_argument("input_file", help = "path to the input csv file") -parser.add_argument("output_file", help = "path to the output pickle file") -parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None) -parser.add_argument("-i", "--import_file", help = "import an existing pipeline from the given location", default = None) -parser.add_argument("-c", "--char_length", action = "store_true", help = "compute the number of characters in the tweet") -args = parser.parse_args() - -# load data -df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n") - -if args.import_file is not None: - # simply import an exisiting FeatureCollector - with open(args.import_file, "rb") as f_in: - feature_collector = pickle.load(f_in) - -else: # need to create FeatureCollector manually - - # collect all feature extractors - features = [] - if args.char_length: - # character length of original tweet (without any changes) - features.append(CharacterLength(COLUMN_TWEET)) - - # create overall FeatureCollector - feature_collector = FeatureCollector(features) - - # fit it on the given data set (assumed to be training data) - feature_collector.fit(df) - - -# apply the given FeatureCollector on the current data set -# maps the pandas DataFrame to an numpy array -feature_array = feature_collector.transform(df) - -# get label array -label_array = np.array(df[COLUMN_LABEL]) -label_array = label_array.reshape(-1, 1) - -# store the results -results = {"features": feature_array, "labels": label_array, - "feature_names": feature_collector.get_feature_names()} -with open(args.output_file, 'wb') as f_out: - pickle.dump(results, f_out) - -# export the FeatureCollector as pickle file if 
desired by user -if args.export_file is not None: - with open(args.export_file, 'wb') as f_out: - pickle.dump(feature_collector, f_out) \ No newline at end of file diff --git a/code/preprocessing.sh b/code/preprocessing.sh deleted file mode 100755 index 61f83ea6..00000000 --- a/code/preprocessing.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# create directory if not yet existing -mkdir -p data/preprocessing/split/ - -# install all NLTK models -python -m nltk.downloader all - -# add labels -echo " creating labels" -python -m code.preprocessing.create_labels data/raw/ data/preprocessing/labeled.csv - -# other preprocessing (removing punctuation etc.) -echo " general preprocessing" -python -m code.preprocessing.run_preprocessing data/preprocessing/labeled.csv data/preprocessing/preprocessed.csv --punctuation --tokenize -e data/preprocessing/pipeline.pickle - -# split the data set -echo " splitting the data set" -python -m code.preprocessing.split_data data/preprocessing/preprocessed.csv data/preprocessing/split/ -s 42 \ No newline at end of file diff --git a/code/preprocessing/create_labels.py b/code/preprocessing/create_labels.py deleted file mode 100644 index 21b1748d..00000000 --- a/code/preprocessing/create_labels.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Reads in the original csv files and creates labels for the data points. -Stores the result as a single pandas DataFrame in a pickle file. - -Created on Tue Sep 28 15:55:44 2021 - -@author: lbechberger -""" - -import os, argparse, csv -import pandas as pd -from code.util import COLUMN_LIKES, COLUMN_RETWEETS, COLUMN_LABEL - -# setting up CLI -parser = argparse.ArgumentParser(description = "Creation of Labels") -parser.add_argument("data_directory", help = "directory where the original csv files reside") -parser.add_argument("output_file", help = "path to the output csv file") -parser.add_argument("-l", '--likes_weight', type = int, help = "weight of likes", default = 1) -parser.add_argument("-r", '--retweet_weight', type = int, help = "weight of retweets", default = 1) -parser.add_argument("-t", '--threshold', type = int, help = "threshold to surpass for positive class", default = 50) -args = parser.parse_args() - -# get all csv files in data_directory -file_paths = [args.data_directory + f for f in os.listdir(args.data_directory) if f.endswith(".csv")] - -# load all csv files -dfs = [] -for file_path in file_paths: - dfs.append(pd.read_csv(file_path, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n")) - -# join all data into a single DataFrame -df = pd.concat(dfs) - -# compute new column "label" based on likes and retweets -df[COLUMN_LABEL] = (args.likes_weight * df[COLUMN_LIKES] + args.retweet_weight * df[COLUMN_RETWEETS]) > args.threshold - -# print statistics -print("Number of tweets: {0}".format(len(df))) -print("Label distribution:") -print(df[COLUMN_LABEL].value_counts(normalize = True)) - -# store the DataFrame into a csv file -df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") \ No newline at end of file diff --git a/code/preprocessing/run_preprocessing.py b/code/preprocessing/run_preprocessing.py deleted file mode 100644 index 72130a30..00000000 --- a/code/preprocessing/run_preprocessing.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Runs the specified collection of preprocessing steps - -Created on Tue Sep 28 16:43:18 2021 - -@author: lbechberger -""" - -import argparse, csv, pickle 
-import pandas as pd -from sklearn.pipeline import make_pipeline -from code.preprocessing.punctuation_remover import PunctuationRemover -from code.preprocessing.tokenizer import Tokenizer -from code.util import COLUMN_TWEET, SUFFIX_TOKENIZED - -# setting up CLI -parser = argparse.ArgumentParser(description = "Various preprocessing steps") -parser.add_argument("input_file", help = "path to the input csv file") -parser.add_argument("output_file", help = "path to the output csv file") -parser.add_argument("-p", "--punctuation", action = "store_true", help = "remove punctuation") -parser.add_argument("-t", "--tokenize", action = "store_true", help = "tokenize given column into individual words") -parser.add_argument("--tokenize_input", help = "input column to tokenize", default = COLUMN_TWEET) -parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None) -args = parser.parse_args() - -# load data -df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n") - -# collect all preprocessors -preprocessors = [] -if args.punctuation: - preprocessors.append(PunctuationRemover()) -if args.tokenize: - preprocessors.append(Tokenizer(args.tokenize_input, args.tokenize_input + SUFFIX_TOKENIZED)) - -# call all preprocessing steps -for preprocessor in preprocessors: - df = preprocessor.fit_transform(df) - -# store the results -df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") - -# create a pipeline if necessary and store it as pickle file -if args.export_file is not None: - pipeline = make_pipeline(*preprocessors) - with open(args.export_file, 'wb') as f_out: - pickle.dump(pipeline, f_out) \ No newline at end of file diff --git a/code/preprocessing/split_data.py b/code/preprocessing/split_data.py deleted file mode 100644 index 57bad668..00000000 --- a/code/preprocessing/split_data.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Splits the preprocessed data into training, validation, and test set. 
- -Created on Tue Sep 28 16:45:51 2021 - -@author: lbechberger -""" - -import os, argparse, csv -import pandas as pd -from sklearn.model_selection import train_test_split -from code.util import COLUMN_LABEL - -# setting up CLI -parser = argparse.ArgumentParser(description = "Splitting the data set") -parser.add_argument("input_file", help = "path to the input csv file") -parser.add_argument("output_folder", help = "path to the output folder") -parser.add_argument("-s", '--seed', type = int, help = "seed for the random number generator", default = None) -parser.add_argument("-t", '--test_size', type = float, help = "relative size of the test set", default = 0.2) -parser.add_argument("-v", '--validation_size', type = float, help = "relative size of the validation set", default = 0.2) -args = parser.parse_args() - -# load the data -df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n") - -# split into (training & validation) and test set -X, X_test = train_test_split(df, test_size = args.test_size, random_state = args.seed, shuffle = True, stratify = df[COLUMN_LABEL]) - -# split remainder into training and validation -relative_validation_size = args.validation_size / (1 - args.test_size) -X_train, X_val = train_test_split(X, test_size = relative_validation_size, random_state = args.seed, shuffle = True, stratify = X[COLUMN_LABEL]) - -# store the three data sets separately -X_train.to_csv(os.path.join(args.output_folder, "training.csv"), index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") -X_val.to_csv(os.path.join(args.output_folder, "validation.csv"), index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") -X_test.to_csv(os.path.join(args.output_folder, "test.csv"), index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") - -print("Training: {0} examples, Validation: {1} examples, Test: {2} examples".format(len(X_train), len(X_val), len(X_test))) \ No newline at end of file diff --git a/code/util.py b/code/util.py deleted file mode 100644 index 7d8794c7..00000000 --- a/code/util.py +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Utility file for collecting frequently used constants and helper functions. 
- -Created on Wed Sep 29 10:50:36 2021 - -@author: lbechberger -""" - -# column names for the original data frame -COLUMN_TWEET = "tweet" -COLUMN_LIKES = "likes_count" -COLUMN_RETWEETS = "retweets_count" - -# column names of novel columns for preprocessing -COLUMN_LABEL = "label" -COLUMN_PUNCTUATION = "tweet_no_punctuation" - -SUFFIX_TOKENIZED = "_tokenized" \ No newline at end of file diff --git a/data/classification/classifier.pickle b/data/classification/classifier.pickle index 012911f3..16ffff56 100644 Binary files a/data/classification/classifier.pickle and b/data/classification/classifier.pickle differ diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/meta.yaml b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/meta.yaml new file mode 100644 index 00000000..baa1b47f --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/artifacts +end_time: 1635977319534 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 255fa50f8e7d4aa588edea084b3adf15 +run_uuid: 255fa50f8e7d4aa588edea084b3adf15 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1635977304454 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Accuracy b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Accuracy new file mode 100644 index 00000000..70668c79 --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Accuracy @@ -0,0 +1 @@ +1635977319507 0.9269994821823659 0 diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Cohen_kappa b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Cohen_kappa new file mode 100644 index 00000000..f9595b70 --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Cohen_kappa @@ -0,0 +1 @@ +1635977319508 0.5834040363937303 0 diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/F1-Score b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/F1-Score new file mode 100644 index 00000000..85ebf8a0 --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/F1-Score @@ -0,0 +1 @@ +1635977319512 0.6237869966035905 0 diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Jaccard b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Jaccard new file mode 100644 index 00000000..3d0685ca --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Jaccard @@ -0,0 +1 @@ +1635977319513 0.4532634083998061 0 diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Precision b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Precision new file mode 100644 index 00000000..4855003d --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Precision @@ -0,0 +1 @@ +1635977319509 0.6058910162002945 0 diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Recall b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Recall new file mode 100644 index 00000000..ddcddaf8 --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/metrics/Recall @@ -0,0 +1 @@ +1635977319510 0.6427723267295794 0 diff --git 
a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/params/classifier b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/params/classifier new file mode 100644 index 00000000..eecfc333 --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/params/classifier @@ -0,0 +1 @@ +knn \ No newline at end of file diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/params/dataset b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/params/dataset new file mode 100644 index 00000000..ce15c0a9 --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/params/dataset @@ -0,0 +1 @@ +training \ No newline at end of file diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/params/k b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/params/k new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/params/k @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.runName b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.runName new file mode 100644 index 00000000..de9cf792 --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.runName @@ -0,0 +1 @@ +after sentiment was added \ No newline at end of file diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.source.git.commit b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.source.git.commit new file mode 100644 index 00000000..75d377e6 --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a73e7450c940376e9373c487fc896f31d231c45b \ No newline at end of file diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.source.name b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.source.type b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.user b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/255fa50f8e7d4aa588edea084b3adf15/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/meta.yaml b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/meta.yaml new file mode 100644 index 00000000..1cb5f70d --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/artifacts +end_time: 1634906752479 
+entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 29048f3f5892425cb3622a4ad04a8c0b +run_uuid: 29048f3f5892425cb3622a4ad04a8c0b +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906752379 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Accuracy b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Accuracy new file mode 100644 index 00000000..36a0a585 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Accuracy @@ -0,0 +1 @@ +1634906752471 0.8315562773619545 0 diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Cohen_kappa b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Cohen_kappa new file mode 100644 index 00000000..a24b8a34 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906752473 0.010411614999104146 0 diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/F1-Score b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/F1-Score new file mode 100644 index 00000000..352b4320 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/F1-Score @@ -0,0 +1 @@ +1634906752476 0.10336402931779741 0 diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Jaccard b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Jaccard new file mode 100644 index 00000000..f3be7376 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Jaccard @@ -0,0 +1 @@ +1634906752478 0.05449861276258423 0 diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Precision b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Precision new file mode 100644 index 00000000..2b5eff49 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Precision @@ -0,0 +1 @@ +1634906752474 0.10361718161266013 0 diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Recall b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Recall new file mode 100644 index 00000000..bcc48219 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Recall @@ -0,0 +1 @@ +1634906752475 0.10311211098612673 0 diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/params/classifier b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/params/classifier new file mode 100644 index 00000000..b11cc475 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/params/classifier @@ -0,0 +1 @@ +stratified \ No newline at end of file diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/params/dataset b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/params/dataset new file mode 100644 index 00000000..efc02160 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/params/dataset @@ -0,0 +1 @@ +validation \ No newline at end of file diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.git.commit b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ 
b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.name b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.type b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.user b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/meta.yaml b/data/classification/mlflow/0/3485614b482945909e5f785567615928/meta.yaml new file mode 100644 index 00000000..f02d35a4 --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/3485614b482945909e5f785567615928/artifacts +end_time: 1635977245122 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 3485614b482945909e5f785567615928 +run_uuid: 3485614b482945909e5f785567615928 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1635977244890 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Accuracy b/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Accuracy new file mode 100644 index 00000000..a68a34dc --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Accuracy @@ -0,0 +1 @@ +1635977245115 0.8940474509250106 0 diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Cohen_kappa b/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Cohen_kappa new file mode 100644 index 00000000..c3faddbe --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Cohen_kappa @@ -0,0 +1 @@ +1635977245117 0.12403091652048526 0 diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/F1-Score b/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/F1-Score new file mode 100644 index 00000000..10b205b1 --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/F1-Score @@ -0,0 +1 @@ +1635977245120 0.16685174902831762 0 diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Jaccard b/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Jaccard new file mode 100644 index 00000000..22c977d8 --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Jaccard @@ -0,0 +1 @@ 
+1635977245121 0.09101923368165986 0 diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Precision b/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Precision new file mode 100644 index 00000000..0f8267b5 --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Precision @@ -0,0 +1 @@ +1635977245118 0.3213903743315508 0 diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Recall b/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Recall new file mode 100644 index 00000000..2fb846d3 --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/metrics/Recall @@ -0,0 +1 @@ +1635977245119 0.11267341582302212 0 diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/params/classifier b/data/classification/mlflow/0/3485614b482945909e5f785567615928/params/classifier new file mode 100644 index 00000000..f5035153 --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/params/classifier @@ -0,0 +1 @@ +randomforest \ No newline at end of file diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/params/dataset b/data/classification/mlflow/0/3485614b482945909e5f785567615928/params/dataset new file mode 100644 index 00000000..efc02160 --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/params/dataset @@ -0,0 +1 @@ +validation \ No newline at end of file diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/params/n b/data/classification/mlflow/0/3485614b482945909e5f785567615928/params/n new file mode 100644 index 00000000..9a037142 --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/params/n @@ -0,0 +1 @@ +10 \ No newline at end of file diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.runName b/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.runName new file mode 100644 index 00000000..e69de29b diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.source.git.commit b/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.source.git.commit new file mode 100644 index 00000000..75d377e6 --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a73e7450c940376e9373c487fc896f31d231c45b \ No newline at end of file diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.source.name b/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.source.type b/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.user b/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.user new file mode 
100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/3485614b482945909e5f785567615928/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/meta.yaml b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/meta.yaml new file mode 100644 index 00000000..3b0da8db --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/artifacts +end_time: 1635982018112 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 3e9fe1f803af400882416350afe9634c +run_uuid: 3e9fe1f803af400882416350afe9634c +source_name: '' +source_type: 4 +source_version: '' +start_time: 1635982003505 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Accuracy b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Accuracy new file mode 100644 index 00000000..47203ec0 --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Accuracy @@ -0,0 +1 @@ +1635982018105 0.8622546250529586 0 diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Cohen_kappa b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Cohen_kappa new file mode 100644 index 00000000..ba357e79 --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Cohen_kappa @@ -0,0 +1 @@ +1635982018106 0.19408522052439148 0 diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/F1-Score b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/F1-Score new file mode 100644 index 00000000..05ed7934 --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/F1-Score @@ -0,0 +1 @@ +1635982018110 0.2701337573660088 0 diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Jaccard b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Jaccard new file mode 100644 index 00000000..ce6cf7aa --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Jaccard @@ -0,0 +1 @@ +1635982018111 0.15615875419054828 0 diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Precision b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Precision new file mode 100644 index 00000000..7e5c5058 --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Precision @@ -0,0 +1 @@ +1635982018107 0.26955385476946053 0 diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Recall b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Recall new file mode 100644 index 00000000..616c3d2e --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/metrics/Recall @@ -0,0 +1 @@ +1635982018109 0.27071616047994 0 diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/params/classifier b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/params/classifier new file mode 100644 index 00000000..eecfc333 --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/params/classifier @@ -0,0 +1 @@ +knn \ No newline at end of file diff --git 
a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/params/dataset b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/params/dataset new file mode 100644 index 00000000..efc02160 --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/params/dataset @@ -0,0 +1 @@ +validation \ No newline at end of file diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/params/k b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/params/k new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/params/k @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.runName b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.runName new file mode 100644 index 00000000..e69de29b diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.source.git.commit b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.source.git.commit new file mode 100644 index 00000000..8f7c99c1 --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +80b9599de2472c1e5df28b9c6711716018b2039f \ No newline at end of file diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.source.name b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.source.type b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.user b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/3e9fe1f803af400882416350afe9634c/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/meta.yaml b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/meta.yaml new file mode 100644 index 00000000..3c08d2d6 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/artifacts +end_time: 1634906750278 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 4d5b6cef36004ae1af5f9aad10adf64a +run_uuid: 4d5b6cef36004ae1af5f9aad10adf64a +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906750021 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Accuracy b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Accuracy new file mode 100644 index 00000000..b42c9de2 --- /dev/null +++ 
b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Accuracy @@ -0,0 +1 @@ +1634906750272 0.8298910229251989 0 diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Cohen_kappa b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Cohen_kappa new file mode 100644 index 00000000..9a759866 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906750274 -0.0008600519779315974 0 diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/F1-Score b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/F1-Score new file mode 100644 index 00000000..2de8d9b5 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/F1-Score @@ -0,0 +1 @@ +1634906750276 0.09299407021617041 0 diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Jaccard b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Jaccard new file mode 100644 index 00000000..70b292b3 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Jaccard @@ -0,0 +1 @@ +1634906750277 0.0487644368398539 0 diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Precision b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Precision new file mode 100644 index 00000000..8eb2df09 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Precision @@ -0,0 +1 @@ +1634906750274 0.09337197580645161 0 diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Recall b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Recall new file mode 100644 index 00000000..8f2f77b3 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Recall @@ -0,0 +1 @@ +1634906750275 0.09261921129929379 0 diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/params/classifier b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/params/classifier new file mode 100644 index 00000000..b11cc475 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/params/classifier @@ -0,0 +1 @@ +stratified \ No newline at end of file diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/params/dataset b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/params/dataset new file mode 100644 index 00000000..ce15c0a9 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/params/dataset @@ -0,0 +1 @@ +training \ No newline at end of file diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.git.commit b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.name b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.name @@ -0,0 +1 @@ 
+E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.type b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.user b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/meta.yaml b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/meta.yaml new file mode 100644 index 00000000..31b2fc75 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/artifacts +end_time: 1634906743997 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 568feaea689947798516e2a96b7edc58 +run_uuid: 568feaea689947798516e2a96b7edc58 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906743903 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Accuracy b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Accuracy new file mode 100644 index 00000000..00dbc64d --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Accuracy @@ -0,0 +1 @@ +1634906743991 0.9058395706821071 0 diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Cohen_kappa b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Cohen_kappa new file mode 100644 index 00000000..9b8c5db1 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906743992 0.0 0 diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/F1-Score b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/F1-Score new file mode 100644 index 00000000..37842d2e --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/F1-Score @@ -0,0 +1 @@ +1634906743995 0.0 0 diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Jaccard b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Jaccard new file mode 100644 index 00000000..37842d2e --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Jaccard @@ -0,0 +1 @@ +1634906743995 0.0 0 diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Precision b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Precision new file mode 100644 index 00000000..26d795f4 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Precision @@ -0,0 +1 @@ +1634906743993 0.0 0 diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Recall b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Recall new file mode 100644 index 
00000000..52fb372f --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Recall @@ -0,0 +1 @@ +1634906743994 0.0 0 diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/params/classifier b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/params/classifier new file mode 100644 index 00000000..ede38720 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/params/classifier @@ -0,0 +1 @@ +most_frequent \ No newline at end of file diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/params/dataset b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/params/dataset new file mode 100644 index 00000000..efc02160 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/params/dataset @@ -0,0 +1 @@ +validation \ No newline at end of file diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.git.commit b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.name b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.type b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.user b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/meta.yaml b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/meta.yaml new file mode 100644 index 00000000..61a3f71b --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/artifacts +end_time: 1634906771848 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 62af76001a3e4770beda60181362e4e5 +run_uuid: 62af76001a3e4770beda60181362e4e5 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906771681 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Accuracy b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Accuracy new file mode 100644 index 00000000..71645d47 --- /dev/null +++ 
b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Accuracy @@ -0,0 +1 @@ +1634906771842 0.9045509108882926 0 diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Cohen_kappa b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Cohen_kappa new file mode 100644 index 00000000..17ce5fa1 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906771843 0.04186413205264283 0 diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/F1-Score b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/F1-Score new file mode 100644 index 00000000..ce0c4be1 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/F1-Score @@ -0,0 +1 @@ +1634906771846 0.05389326334208224 0 diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Jaccard b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Jaccard new file mode 100644 index 00000000..3c075306 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Jaccard @@ -0,0 +1 @@ +1634906771847 0.0276928609962237 0 diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Precision b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Precision new file mode 100644 index 00000000..ad0a2c0e --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Precision @@ -0,0 +1 @@ +1634906771844 0.4041994750656168 0 diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Recall b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Recall new file mode 100644 index 00000000..cb94e5e5 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Recall @@ -0,0 +1 @@ +1634906771845 0.028871391076115485 0 diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/classifier b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/classifier new file mode 100644 index 00000000..f5035153 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/classifier @@ -0,0 +1 @@ +randomforest \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/dataset b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/dataset new file mode 100644 index 00000000..efc02160 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/dataset @@ -0,0 +1 @@ +validation \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/n b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/n new file mode 100644 index 00000000..9a037142 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/n @@ -0,0 +1 @@ +10 \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.git.commit b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git 
a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.name b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.type b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.user b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/meta.yaml b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/meta.yaml new file mode 100644 index 00000000..8e453c52 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/artifacts +end_time: 1634906791009 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 73628a4a7c194985bf8ad402d54d9e11 +run_uuid: 73628a4a7c194985bf8ad402d54d9e11 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906778765 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Accuracy b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Accuracy new file mode 100644 index 00000000..e796f24b --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Accuracy @@ -0,0 +1 @@ +1634906790986 0.8436602645577367 0 diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Cohen_kappa b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Cohen_kappa new file mode 100644 index 00000000..e67e8622 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906790989 0.0993382191603408 0 diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/F1-Score b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/F1-Score new file mode 100644 index 00000000..592a11f3 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/F1-Score @@ -0,0 +1 @@ +1634906790993 0.18577426373693726 0 diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Jaccard b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Jaccard new file mode 100644 index 00000000..088fbcdc --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Jaccard @@ -0,0 +1 @@ +1634906790993 0.10239864864864864 0 diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Precision 
b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Precision new file mode 100644 index 00000000..9eda6a53 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Precision @@ -0,0 +1 @@ +1634906790990 0.18226097414311485 0 diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Recall b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Recall new file mode 100644 index 00000000..5014ca2c --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Recall @@ -0,0 +1 @@ +1634906790991 0.18942566089619398 0 diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/classifier b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/classifier new file mode 100644 index 00000000..eecfc333 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/classifier @@ -0,0 +1 @@ +knn \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/dataset b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/dataset new file mode 100644 index 00000000..ce15c0a9 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/dataset @@ -0,0 +1 @@ +training \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/k b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/k new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/k @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.git.commit b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.name b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.type b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.user b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/meta.yaml b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/meta.yaml new file mode 100644 index 
00000000..005c2cc0 --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/artifacts +end_time: 1635981911880 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 98d5031f184b4f5f817a98f46f30a12d +run_uuid: 98d5031f184b4f5f817a98f46f30a12d +source_name: '' +source_type: 4 +source_version: '' +start_time: 1635981911634 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Accuracy b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Accuracy new file mode 100644 index 00000000..d33a6031 --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Accuracy @@ -0,0 +1 @@ +1635981911872 0.8986018923880807 0 diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Cohen_kappa b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Cohen_kappa new file mode 100644 index 00000000..66fb2022 --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Cohen_kappa @@ -0,0 +1 @@ +1635981911874 0.20518464890935928 0 diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/F1-Score b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/F1-Score new file mode 100644 index 00000000..a8a4b844 --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/F1-Score @@ -0,0 +1 @@ +1635981911878 0.25071745369162535 0 diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Jaccard b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Jaccard new file mode 100644 index 00000000..790edc67 --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Jaccard @@ -0,0 +1 @@ +1635981911879 0.14332587621178225 0 diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Precision b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Precision new file mode 100644 index 00000000..ce7024f9 --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Precision @@ -0,0 +1 @@ +1635981911875 0.4120926243567753 0 diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Recall b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Recall new file mode 100644 index 00000000..c427459b --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/metrics/Recall @@ -0,0 +1 @@ +1635981911876 0.1801649793775778 0 diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/params/classifier b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/params/classifier new file mode 100644 index 00000000..f5035153 --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/params/classifier @@ -0,0 +1 @@ +randomforest \ No newline at end of file diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/params/dataset b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/params/dataset new file mode 100644 index 00000000..efc02160 --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/params/dataset @@ -0,0 +1 @@ +validation \ No newline at end of file diff --git 
a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/params/n b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/params/n new file mode 100644 index 00000000..9a037142 --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/params/n @@ -0,0 +1 @@ +10 \ No newline at end of file diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.runName b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.runName new file mode 100644 index 00000000..e69de29b diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.source.git.commit b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.source.git.commit new file mode 100644 index 00000000..8f7c99c1 --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +80b9599de2472c1e5df28b9c6711716018b2039f \ No newline at end of file diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.source.name b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.source.type b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.user b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/98d5031f184b4f5f817a98f46f30a12d/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/meta.yaml b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/meta.yaml new file mode 100644 index 00000000..f7e25c2f --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/artifacts +end_time: 1635981909453 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: a78d758e988e49728f9fe39f81159b21 +run_uuid: a78d758e988e49728f9fe39f81159b21 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1635981906963 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Accuracy b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Accuracy new file mode 100644 index 00000000..041385a4 --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Accuracy @@ -0,0 +1 @@ +1635981909392 0.9706197335592901 0 diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Cohen_kappa b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Cohen_kappa new file mode 100644 index 00000000..5363bda7 --- /dev/null +++ 
b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Cohen_kappa @@ -0,0 +1 @@ +1635981909394 0.8108972187811726 0 diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/F1-Score b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/F1-Score new file mode 100644 index 00000000..e1025d18 --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/F1-Score @@ -0,0 +1 @@ +1635981909397 0.8267342193843912 0 diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Jaccard b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Jaccard new file mode 100644 index 00000000..4107ceda --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Jaccard @@ -0,0 +1 @@ +1635981909398 0.7046435965690624 0 diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Precision b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Precision new file mode 100644 index 00000000..b0109689 --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Precision @@ -0,0 +1 @@ +1635981909395 0.9294631710362048 0 diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Recall b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Recall new file mode 100644 index 00000000..acc43d05 --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/metrics/Recall @@ -0,0 +1 @@ +1635981909396 0.7444534716580213 0 diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/params/classifier b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/params/classifier new file mode 100644 index 00000000..f5035153 --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/params/classifier @@ -0,0 +1 @@ +randomforest \ No newline at end of file diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/params/dataset b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/params/dataset new file mode 100644 index 00000000..ce15c0a9 --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/params/dataset @@ -0,0 +1 @@ +training \ No newline at end of file diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/params/n b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/params/n new file mode 100644 index 00000000..9a037142 --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/params/n @@ -0,0 +1 @@ +10 \ No newline at end of file diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.runName b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.runName new file mode 100644 index 00000000..7e77f9fb --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.runName @@ -0,0 +1 @@ +after more count features \ No newline at end of file diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.source.git.commit b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.source.git.commit new file mode 100644 index 00000000..8f7c99c1 --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +80b9599de2472c1e5df28b9c6711716018b2039f \ No newline at end of file diff --git 
a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.source.name b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.source.type b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.user b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/a78d758e988e49728f9fe39f81159b21/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/meta.yaml b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/meta.yaml new file mode 100644 index 00000000..89d99cca --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/artifacts +end_time: 1634906769598 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: a7e9e6e2984448b39f2b82f11b4ed46c +run_uuid: a7e9e6e2984448b39f2b82f11b4ed46c +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906768317 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Accuracy b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Accuracy new file mode 100644 index 00000000..98abeb7c --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Accuracy @@ -0,0 +1 @@ +1634906769587 0.908393353104552 0 diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Cohen_kappa b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Cohen_kappa new file mode 100644 index 00000000..c2a029b6 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906769588 0.08152669872440343 0 diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/F1-Score b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/F1-Score new file mode 100644 index 00000000..6e983bad --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/F1-Score @@ -0,0 +1 @@ +1634906769591 0.09330227140361096 0 diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Jaccard b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Jaccard new file mode 100644 index 00000000..79473c13 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Jaccard @@ -0,0 +1 @@ +1634906769592 0.048933960535157923 0 diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Precision 
b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Precision new file mode 100644 index 00000000..e69642af --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Precision @@ -0,0 +1 @@ +1634906769589 0.6852010265183918 0 diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Recall b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Recall new file mode 100644 index 00000000..3f9ff12a --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Recall @@ -0,0 +1 @@ +1634906769590 0.05005937128929442 0 diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/classifier b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/classifier new file mode 100644 index 00000000..f5035153 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/classifier @@ -0,0 +1 @@ +randomforest \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/dataset b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/dataset new file mode 100644 index 00000000..ce15c0a9 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/dataset @@ -0,0 +1 @@ +training \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/n b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/n new file mode 100644 index 00000000..9a037142 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/n @@ -0,0 +1 @@ +10 \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.git.commit b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.name b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.type b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.user b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/meta.yaml b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/meta.yaml new file mode 
100644 index 00000000..76595577 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/artifacts +end_time: 1634906741758 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: af59d8fd6467448887013f0561655ca2 +run_uuid: af59d8fd6467448887013f0561655ca2 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906741516 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Accuracy b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Accuracy new file mode 100644 index 00000000..8e439910 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Accuracy @@ -0,0 +1 @@ +1634906741751 0.905845454973403 0 diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Cohen_kappa b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Cohen_kappa new file mode 100644 index 00000000..4b5cb0b2 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906741752 0.0 0 diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/F1-Score b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/F1-Score new file mode 100644 index 00000000..a32d8f40 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/F1-Score @@ -0,0 +1 @@ +1634906741755 0.0 0 diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Jaccard b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Jaccard new file mode 100644 index 00000000..e3a1e26b --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Jaccard @@ -0,0 +1 @@ +1634906741756 0.0 0 diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Precision b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Precision new file mode 100644 index 00000000..07a50e00 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Precision @@ -0,0 +1 @@ +1634906741753 0.0 0 diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Recall b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Recall new file mode 100644 index 00000000..e5d55367 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Recall @@ -0,0 +1 @@ +1634906741754 0.0 0 diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/params/classifier b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/params/classifier new file mode 100644 index 00000000..ede38720 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/params/classifier @@ -0,0 +1 @@ +most_frequent \ No newline at end of file diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/params/dataset b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/params/dataset new file mode 100644 index 00000000..ce15c0a9 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/params/dataset @@ -0,0 +1 @@ +training \ No newline at end of file diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.git.commit 
b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.name b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.type b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.user b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/meta.yaml b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/meta.yaml new file mode 100644 index 00000000..60c7576f --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/artifacts +end_time: 1634906794982 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: b0bb43858340487191c30da4b1a7218e +run_uuid: b0bb43858340487191c30da4b1a7218e +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906793160 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Accuracy b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Accuracy new file mode 100644 index 00000000..77dc0de8 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Accuracy @@ -0,0 +1 @@ +1634906794975 0.8351045050134162 0 diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Cohen_kappa b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Cohen_kappa new file mode 100644 index 00000000..7b368705 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906794977 0.06410899327093145 0 diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/F1-Score b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/F1-Score new file mode 100644 index 00000000..68123d0f --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/F1-Score @@ -0,0 +1 @@ +1634906794979 0.15534858486300748 0 diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Jaccard b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Jaccard new file 
mode 100644 index 00000000..535ef0da --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Jaccard @@ -0,0 +1 @@ +1634906794980 0.0842156862745098 0 diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Precision b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Precision new file mode 100644 index 00000000..7aa4cad5 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Precision @@ -0,0 +1 @@ +1634906794978 0.15004366812227074 0 diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Recall b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Recall new file mode 100644 index 00000000..df747ad3 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Recall @@ -0,0 +1 @@ +1634906794979 0.16104236970378702 0 diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/classifier b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/classifier new file mode 100644 index 00000000..eecfc333 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/classifier @@ -0,0 +1 @@ +knn \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/dataset b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/dataset new file mode 100644 index 00000000..efc02160 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/dataset @@ -0,0 +1 @@ +validation \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/k b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/k new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/k @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.git.commit b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.name b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.type b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.user b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ 
b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/meta.yaml b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/meta.yaml new file mode 100644 index 00000000..bed582b8 --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/artifacts +end_time: 1635982001224 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: c0c26bf895da404db4d0a18842fdc665 +run_uuid: c0c26bf895da404db4d0a18842fdc665 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1635981927774 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Accuracy b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Accuracy new file mode 100644 index 00000000..b2cc30aa --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Accuracy @@ -0,0 +1 @@ +1635982001160 0.976168620251377 0 diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Cohen_kappa b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Cohen_kappa new file mode 100644 index 00000000..715aca9b --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Cohen_kappa @@ -0,0 +1 @@ +1635982001161 0.8593142653058983 0 diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/F1-Score b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/F1-Score new file mode 100644 index 00000000..5a86ddfa --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/F1-Score @@ -0,0 +1 @@ +1635982001165 0.8724570132896643 0 diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Jaccard b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Jaccard new file mode 100644 index 00000000..f62e2e78 --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Jaccard @@ -0,0 +1 @@ +1635982001166 0.7737682940453581 0 diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Precision b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Precision new file mode 100644 index 00000000..adef027f --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Precision @@ -0,0 +1 @@ +1635982001163 0.8793245730971878 0 diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Recall b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Recall new file mode 100644 index 00000000..ad2ddd42 --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/metrics/Recall @@ -0,0 +1 @@ +1635982001164 0.8656958940066246 0 diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/params/classifier b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/params/classifier new file mode 100644 index 00000000..eecfc333 --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/params/classifier @@ -0,0 +1 @@ +knn \ No newline at end of file diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/params/dataset 
b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/params/dataset new file mode 100644 index 00000000..ce15c0a9 --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/params/dataset @@ -0,0 +1 @@ +training \ No newline at end of file diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/params/k b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/params/k new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/params/k @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.runName b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.runName new file mode 100644 index 00000000..7e77f9fb --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.runName @@ -0,0 +1 @@ +after more count features \ No newline at end of file diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.source.git.commit b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.source.git.commit new file mode 100644 index 00000000..8f7c99c1 --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +80b9599de2472c1e5df28b9c6711716018b2039f \ No newline at end of file diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.source.name b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.source.type b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.user b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/c0c26bf895da404db4d0a18842fdc665/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/meta.yaml b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/meta.yaml new file mode 100644 index 00000000..b2925baf --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/artifacts +end_time: 1635977323897 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: c5fad049ce6e46af9c9d4cf00a5b7c23 +run_uuid: c5fad049ce6e46af9c9d4cf00a5b7c23 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1635977321970 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Accuracy b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Accuracy new file 
mode 100644 index 00000000..a7164f8b --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Accuracy @@ -0,0 +1 @@ +1635977323890 0.8449371557689592 0 diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Cohen_kappa b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Cohen_kappa new file mode 100644 index 00000000..f4e085cf --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Cohen_kappa @@ -0,0 +1 @@ +1635977323892 0.11291814833336233 0 diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/F1-Score b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/F1-Score new file mode 100644 index 00000000..e6cfb735 --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/F1-Score @@ -0,0 +1 @@ +1635977323895 0.1986863711001642 0 diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Jaccard b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Jaccard new file mode 100644 index 00000000..ecb3a8a0 --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Jaccard @@ -0,0 +1 @@ +1635977323896 0.11030082041932543 0 diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Precision b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Precision new file mode 100644 index 00000000..aa689d14 --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Precision @@ -0,0 +1 @@ +1635977323893 0.19349680170575692 0 diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Recall b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Recall new file mode 100644 index 00000000..b68b0167 --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/metrics/Recall @@ -0,0 +1 @@ +1635977323894 0.20416197975253092 0 diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/params/classifier b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/params/classifier new file mode 100644 index 00000000..eecfc333 --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/params/classifier @@ -0,0 +1 @@ +knn \ No newline at end of file diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/params/dataset b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/params/dataset new file mode 100644 index 00000000..efc02160 --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/params/dataset @@ -0,0 +1 @@ +validation \ No newline at end of file diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/params/k b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/params/k new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/params/k @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.runName b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.runName new file mode 100644 index 00000000..e69de29b diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.source.git.commit b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.source.git.commit new file mode 100644 
index 00000000..75d377e6 --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a73e7450c940376e9373c487fc896f31d231c45b \ No newline at end of file diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.source.name b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.source.type b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.user b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/c5fad049ce6e46af9c9d4cf00a5b7c23/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/meta.yaml b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/meta.yaml new file mode 100644 index 00000000..79793223 --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/artifacts +end_time: 1635977242700 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: dbba7c73f7924cea9e0e077e014466e1 +run_uuid: dbba7c73f7924cea9e0e077e014466e1 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1635977240569 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Accuracy b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Accuracy new file mode 100644 index 00000000..3ed94bf6 --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Accuracy @@ -0,0 +1 @@ +1635977242656 0.9464882549545732 0 diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Cohen_kappa b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Cohen_kappa new file mode 100644 index 00000000..43c6a9b7 --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Cohen_kappa @@ -0,0 +1 @@ +1635977242658 0.6043889364913619 0 diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/F1-Score b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/F1-Score new file mode 100644 index 00000000..fddeed13 --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/F1-Score @@ -0,0 +1 @@ +1635977242661 0.6304453836150845 0 diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Jaccard b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Jaccard new file mode 100644 index 00000000..e7feb776 --- /dev/null +++ 
b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Jaccard @@ -0,0 +1 @@ +1635977242662 0.46032876387158034 0 diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Precision b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Precision new file mode 100644 index 00000000..f73474b8 --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Precision @@ -0,0 +1 @@ +1635977242659 0.901243174160567 0 diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Recall b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Recall new file mode 100644 index 00000000..42a9149a --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/metrics/Recall @@ -0,0 +1 @@ +1635977242660 0.4847822011124305 0 diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/params/classifier b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/params/classifier new file mode 100644 index 00000000..f5035153 --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/params/classifier @@ -0,0 +1 @@ +randomforest \ No newline at end of file diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/params/dataset b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/params/dataset new file mode 100644 index 00000000..ce15c0a9 --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/params/dataset @@ -0,0 +1 @@ +training \ No newline at end of file diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/params/n b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/params/n new file mode 100644 index 00000000..9a037142 --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/params/n @@ -0,0 +1 @@ +10 \ No newline at end of file diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.runName b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.runName new file mode 100644 index 00000000..de9cf792 --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.runName @@ -0,0 +1 @@ +after sentiment was added \ No newline at end of file diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.source.git.commit b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.source.git.commit new file mode 100644 index 00000000..75d377e6 --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a73e7450c940376e9373c487fc896f31d231c45b \ No newline at end of file diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.source.name b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.source.type b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ 
b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.user b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/dbba7c73f7924cea9e0e077e014466e1/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/meta.yaml b/data/classification/mlflow/0/meta.yaml new file mode 100644 index 00000000..0c88f710 --- /dev/null +++ b/data/classification/mlflow/0/meta.yaml @@ -0,0 +1,4 @@ +artifact_location: data/classification/mlflow/0 +experiment_id: '0' +lifecycle_stage: active +name: Default diff --git a/data/dimensionality_reduction/pipeline.pickle b/data/dimensionality_reduction/pipeline.pickle index 566baf5a..8fa8db23 100644 Binary files a/data/dimensionality_reduction/pipeline.pickle and b/data/dimensionality_reduction/pipeline.pickle differ diff --git a/data/dimensionality_reduction/test.pickle b/data/dimensionality_reduction/test.pickle index 40ffb175..e901e0fe 100644 Binary files a/data/dimensionality_reduction/test.pickle and b/data/dimensionality_reduction/test.pickle differ diff --git a/data/dimensionality_reduction/training.pickle b/data/dimensionality_reduction/training.pickle index f87bd9b5..e2cdee9c 100644 Binary files a/data/dimensionality_reduction/training.pickle and b/data/dimensionality_reduction/training.pickle differ diff --git a/data/dimensionality_reduction/validation.pickle b/data/dimensionality_reduction/validation.pickle index d3ced73e..38d0f914 100644 Binary files a/data/dimensionality_reduction/validation.pickle and b/data/dimensionality_reduction/validation.pickle differ diff --git a/data/feature_extraction/pipeline.pickle b/data/feature_extraction/pipeline.pickle index e7be5a45..8eecbf0b 100644 Binary files a/data/feature_extraction/pipeline.pickle and b/data/feature_extraction/pipeline.pickle differ diff --git a/data/feature_extraction/test.pickle b/data/feature_extraction/test.pickle index a96dba62..51e8a6c7 100644 Binary files a/data/feature_extraction/test.pickle and b/data/feature_extraction/test.pickle differ diff --git a/data/feature_extraction/training.pickle b/data/feature_extraction/training.pickle index df5d0d53..72fa5a82 100644 Binary files a/data/feature_extraction/training.pickle and b/data/feature_extraction/training.pickle differ diff --git a/data/feature_extraction/validation.pickle b/data/feature_extraction/validation.pickle index f3c5ced4..af883042 100644 Binary files a/data/feature_extraction/validation.pickle and b/data/feature_extraction/validation.pickle differ diff --git a/data/gridsearch_results.csv b/data/gridsearch_results.csv new file mode 100644 index 00000000..61a79310 --- /dev/null +++ b/data/gridsearch_results.csv @@ -0,0 +1,45 @@ +,param_min_samples_split,param_n_estimators,mean_test_cohen_kappa,rank_test_cohen_kappa,mean_test_rec,rank_test_rec,mean_test_prec,rank_test_prec,rank_sum +6,2,77,0.21032947876250868,2,0.17511379256482348,6,0.44951794430505243,32,40 +1,2,17,0.21063906275551716,1,0.1831133434864105,2,0.4230331961941095,41,44 +5,2,65,0.20886364936456098,3,0.17442633161512028,7,0.4460091484599403,35,45 +3,2,41,0.20799252732722429,5,0.17567646829115902,5,0.43780214161181774,38,48 +7,2,89,0.2085003951239579,4,0.17405154639175255,8,0.44581997443371807,36,48 
+10,2,125,0.20731297949198169,6,0.17167662449234614,12,0.4500504796260829,31,49 +8,2,101,0.2071892951061664,7,0.17217652686660417,10,0.44772178217165903,34,51 +9,2,113,0.20710088909630747,8,0.1718643783192752,11,0.44838625471423976,33,52 +2,2,29,0.20571535678910546,10,0.17617639019056547,4,0.42701155533889956,40,54 +17,4,77,0.1971604900640554,14,0.15286519837550766,19,0.49064274582341855,21,54 +4,2,53,0.20625278992852097,9,0.1736140073414558,9,0.4382647790171278,37,55 +12,4,17,0.1981555959034252,12,0.16036500312402374,14,0.45686773053605945,30,56 +0,2,5,0.20083369740170615,11,0.19761252343017807,1,0.35719418885681564,44,56 +15,4,53,0.19638446197768816,15,0.15286537410184317,18,0.4862803411117932,25,58 +20,4,113,0.19545524418296217,18,0.15086535457669478,21,0.4926928538343057,20,59 +11,4,5,0.19727602548922335,13,0.17905135114026866,3,0.3879463628549637,43,59 +13,4,29,0.19618057879279197,16,0.15517736644798502,15,0.4726763266879991,29,60 +16,4,65,0.19614263548807906,17,0.1529275812246173,17,0.484532411844388,27,61 +14,4,41,0.19483626799678944,19,0.15324025695095284,16,0.4756330683961238,28,63 +19,4,101,0.194696442075626,20,0.15067789362699158,22,0.48962813526484605,22,64 +28,6,77,0.18249127068575738,26,0.13405400656044988,27,0.5273419139893736,11,64 +31,6,113,0.18018176000442848,28,0.13192900656044987,30,0.5282045487863226,9,67 +21,4,125,0.19349336004881384,21,0.15011541315213997,23,0.48592285968464155,26,70 +18,4,89,0.19312470874829782,22,0.14936547172758513,24,0.48813429514750606,24,70 +23,6,17,0.18464780068276904,24,0.1413030498281787,25,0.48849086245569795,23,72 +24,6,29,0.18152881641188415,27,0.13617883083411436,26,0.5036483966971799,19,72 +30,6,101,0.17899175227023506,29,0.1313666432364886,31,0.5236574106102454,12,72 +26,6,53,0.17898975468828954,30,0.13242883083411433,29,0.5147323279525795,16,75 +25,6,41,0.1789529394817339,31,0.1333666822867854,28,0.506941666585039,18,77 +27,6,65,0.17823922967001732,32,0.13124174086223056,32,0.5192187107953378,14,78 +22,6,5,0.18939261745430072,23,0.16267734692283659,13,0.40930000263148186,42,78 +38,8,65,0.16666234457231224,38,0.11867974851608873,38,0.5435948637265622,5,81 +29,6,89,0.17659563337634432,33,0.12986660418619184,33,0.5185023064539108,15,81 +32,6,125,0.17505195691465664,34,0.1279918580131209,34,0.5227362189308332,13,81 +36,8,41,0.16667684093083346,37,0.11942978756638552,37,0.5357472248948277,8,82 +43,8,125,0.1646164041526758,40,0.11630515854420494,41,0.5514007376793099,1,82 +35,8,29,0.16932657583427738,36,0.12249217041549516,36,0.5278215384037906,10,82 +33,8,5,0.1836505649308598,25,0.15117810840362386,20,0.4296789841567411,39,84 +37,8,53,0.16484733926096468,39,0.1178675413933146,39,0.5361497397393733,7,85 +34,8,17,0.17051946516429975,35,0.12517933848797252,35,0.5123416980008006,17,87 +39,8,77,0.16404765853571726,41,0.11699222899094033,40,0.5389020817338767,6,87 +41,8,101,0.1638946904668479,42,0.11611746329272103,42,0.5463617966640933,3,87 +42,8,113,0.16361556728538118,43,0.11574263901905654,44,0.5484006612819178,2,89 +40,8,89,0.16327155598947937,44,0.11586730709153388,43,0.5436761259312005,4,91 diff --git a/docs/Documentation.md b/docs/Documentation.md new file mode 100644 index 00000000..12c00167 --- /dev/null +++ b/docs/Documentation.md @@ -0,0 +1,163 @@ +# Documentation - [Patoali](https://trello.com/b/3pj6SkWa) + +This document presents the author's work on the 'Machine Learning in Practice' project which took place during the summer term 2021 as a block seminar at Osnabrück University. 
The given task was to analyze a data set containing data science-related tweets and predict whether a tweet will go viral or not by applying machine learning techniques. A tweet is defined as viral if the sum of its likes and retweets exceeds the arbitrary threshold of 50. The data set _Data Science Tweets 2010-2021_ contains _data science_, _data analysis_, and _data visualization_ tweets from verified accounts on Twitter from 2010 until 2021. It was collected and [shared on kaggle.com](https://www.kaggle.com/ruchi798/data-science-tweets) by Ruchi Bhatia.
+
+The lecturer Lucas Bechberger provided his students with a foundational codebase that makes heavy use of the python library scikit-learn. The codebase consists of multiple python (`.py`) and bash (`.sh`) scripts that form a basic pipeline of the processing steps _preprocessing_, _feature extraction_, _dimensionality reduction_, and _classification_, which is common for machine learning projects. The shell scripts invoke the python scripts with a particular set of command-line arguments. Shell scripts can be used to run the entire pipeline or to execute only individual steps to save time. Results of the pipeline steps are stored in `.pickle` files to reuse them in a separate application. The application offers a rudimentary read–eval–print loop to predict the virality of the tweet a user inputs. The task of the students is to understand the code base and extend or replace given placeholder implementations with proper solutions to measure and improve the virality prediction.
+
+## Evaluation
+
+Before taking a look at the implemented metrics for judging the prediction performance of various models, some specifics about the data set at hand need to be considered. The raw data consists of the three `.csv` files _data science_, _data analysis_, and _data visualization_. In a first preprocessing step, they are appended to form one big data set. In the next step the data is labeled as viral or not viral according to the above-mentioned threshold rule. The resulting data set consists of 295,811 tweet records with a distribution of 90.82% non-viral and 9.18% viral tweets. Such an uneven distribution of labeling classes is often referred to as an imbalanced data set. This fact has to be taken into account when comparing the results of baselines with classifiers and when selecting suitable metrics.
+
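+As a reference for the threshold rule described above, the labeling step could look roughly like the following pandas sketch. The column names `likes_count`, `retweets_count`, and `label` are assumptions for illustration and not necessarily the names used in the actual preprocessing code:
+
+```python
+import pandas as pd
+
+THRESHOLD = 50  # arbitrary virality threshold on the sum of likes and retweets
+
+def add_viral_label(df: pd.DataFrame) -> pd.DataFrame:
+    """Adds a boolean label column that marks a tweet as viral if its
+    combined number of likes and retweets exceeds the threshold."""
+    df = df.copy()
+    df["label"] = (df["likes_count"] + df["retweets_count"]) > THRESHOLD
+    return df
+```
+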
+
+
+Fig. 1. Performance of the sklearn DummyClassifier with the strategies 'stratified' and 'most_frequent' on the training and validation data sets for all implemented metrics.
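+
+The baseline numbers shown in Fig. 1 come from runs along the following lines. This is a hedged, minimal sketch rather than the project's actual `run_classifier.py`; the variables `X_train`, `y_train`, `X_val`, and `y_val` are placeholders for the extracted feature matrices and labels:
+
+```python
+from sklearn.dummy import DummyClassifier
+from sklearn.metrics import (accuracy_score, cohen_kappa_score, f1_score,
+                             jaccard_score, precision_score, recall_score)
+
+def evaluate(clf, features, labels):
+    """Computes all metrics discussed in this section for a fitted classifier."""
+    pred = clf.predict(features)
+    return {
+        "Accuracy": accuracy_score(labels, pred),
+        "Cohen_kappa": cohen_kappa_score(labels, pred),
+        "F1-Score": f1_score(labels, pred),
+        "Jaccard": jaccard_score(labels, pred),
+        "Precision": precision_score(labels, pred, zero_division=0),
+        "Recall": recall_score(labels, pred),
+    }
+
+for strategy in ("most_frequent", "stratified"):
+    baseline = DummyClassifier(strategy=strategy, random_state=42)
+    baseline.fit(X_train, y_train)  # placeholder training features and labels
+    print(strategy, evaluate(baseline, X_val, y_val))
+```
+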
+
+For the baselines, a `DummyClassifier` from the sklearn module was used with the `strategy` `most_frequent` and `stratified`. The former determines non-viral tweets as the most frequent class and therefore predicts every sample as non-viral. [Fig. 1](#baselines) shows that this rather dumb prediction strategy results in a high accuracy of 90.6%. This is the case because the calculation of the accuracy metric is based on how many predictions have been correct. Since the data set contains mostly non-viral tweets, the prediction is correct most of the time, with a percentage that is similar to the data set's class distribution. The slight difference in the percentage can be explained by the removal of some samples during the preprocessing step.
+
+The `stratified` strategy predicts by respecting the training set’s class distribution. Again the accuracy has a high value of 83.2% on the validation set. In both observations the accuracy metric performs well on the baselines, indicating that it is not useful for the imbalanced data set and can therefore be dismissed entirely. The other metrics _Precision_, _Recall_, _F1-score_, _Cohen's Kappa coefficient_, and _Jaccard score_ are not zero this time but still have very low values, roughly between 0 and 0.1, which is a bad result. Some considerations about the remaining metrics are discussed in the following paragraphs.
+
+When selecting metrics, the use case should be taken into account. An average Twitter user would expect that most sent tweets will not go viral. When such a user types a potential tweet into our application to find out whether it is going to be viral, it is important that a tweet which would go viral is detected as such. This can be captured by the recall metric which asks the question _"How many of the true positives did I catch?"_. On the other hand, it would be annoying if the application is not critical enough and classifies a lot of tweets as viral that don't go viral in practice. Such a high rate of false positives is captured by the precision metric which asks _"How many positively classified ones are actually positive?"_. Therefore, both recall and precision are good metrics for the use case. As an addition, the F1-score, which combines both recall and precision as a weighted average, is also used.
+
+Furthermore, Cohen's Kappa is a good candidate for an imbalanced data set. In its calculation, the accuracy is used, but adjusted by the probability of random agreement, and it is therefore considered a more robust measure than simple percent agreement calculations. Additionally, the Jaccard score leaves out true negatives in its calculation. Since it can be expected that this is the most frequently appearing type of result in a confusion matrix, the Jaccard score is also well-suited for the data set. All in all, the metrics _Cohen's Kappa_, _F1-score_, _Jaccard score_, _precision_, and _recall_ are used to judge the model's prediction performance by comparing the scores of the model to the scores of the chosen baselines.
+
+## Preprocessing
+
+This section explains what kind of preprocessing operations are applied to prepare the data for the feature extraction and training in later steps of the pipeline to achieve the best possible prediction performance.
+
+### Preprocessing in General & Provided Preprocessors
+
+After the above-mentioned actions of appending the raw data set to a big one and labeling it, a few more preprocessing operations are performed. These operations are useful to improve the performance of models.
Usually, models cannot process prose or text in general and therefore need the textual data to be transformed into numerical values. The applied operations are organized in their own preprocessor classes by inheriting from the sklearn classes `BaseEstimator` and `TransformerMixin` (a minimal sketch of such a transformer-style preprocessor is shown below, after Fig. 2). The first two preprocessors `PunctuationRemover` and `Tokenizer` were already provided by the lecturer. The former removes punctuation ASCII characters from the tweet column and saves the result in a new column. This should result in mostly pure text. The latter takes a text column as the input and splits the text into an array of one word per element. The resulting data of both preprocessors is well-suited for applying NLP techniques during the feature extraction step.
+
+### NonEnglishRemover & ColumnDropper
+
+Two more preprocessors are implemented. First, the `NonEnglishRemover` removes all data rows that are labeled as being non-English. This was done after exploring and visualizing the data set in [`visualization.py`](../src/visualization.py). As can be seen in [Fig. 2](#tweets-language), the majority of tweets are labeled as English (95.57%). The removal of non-English tweets is useful because most pre-trained NLP models and other NLP techniques are optimized for English texts. Additionally, the next biggest language only has 3492 records, which is too little data to perform any meaningful machine learning on. It should be noted, though, that some non-English tweets remain in the data set after this operation because they were labeled incorrectly. Because this mislabeling is rare, these tweets can be regarded as noise and need not be taken into account further.
+Second, the `ColumnDropper` removes columns that are not needed. This is simply for the convenience of having fewer columns when looking at the preprocessed data set, which is saved as an intermediate data set.
+
+
+
+Fig. 2. The majority of tweet records are labeled as English. The number of non-English tweets is too small to be useful for machine learning.
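+
+All custom preprocessors follow the sklearn transformer pattern described above. The following is a simplified, illustrative sketch of a `NonEnglishRemover`-style class; the column name `"language"` and the exact internals are assumptions and not the project's actual implementation:
+
+```python
+from sklearn.base import BaseEstimator, TransformerMixin
+
+class LanguageFilter(BaseEstimator, TransformerMixin):
+    """Keeps only the rows whose language column matches the desired language."""
+
+    def __init__(self, language_column="language", keep="en"):
+        self.language_column = language_column
+        self.keep = keep
+
+    def fit(self, df, y=None):
+        # Purely rule-based filter, so there is nothing to learn from the data.
+        return self
+
+    def transform(self, df):
+        return df[df[self.language_column] == self.keep].copy()
+```
+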
+
+### TweetCleaner (removes hashtags, URLs, and usernames)
+
+Further obvious preprocessing operations are the removal of hashtags, URLs, and Twitter usernames from the tweet, because various NLP techniques would otherwise come across unknown words and expressions, which would decrease the performance. The implementation of this preprocessor was done in the `Tweetclean` branch, but it was not entirely finished and is therefore not used in any features. Just for the record, the preprocessor was mostly implemented by a team member who dropped out of the course.
+
+## Feature Extraction
+
+Besides the already given `CharacterLengthFE`, two more feature extractors have been implemented, namely `CounterFE` and `SentimentFE`. The former is applied to multiple columns and thereby creates multiple features. In general, the feature extractors inherit from a custom `FeatureExtractor` class which in turn inherits from the sklearn classes `BaseEstimator` and `TransformerMixin`. This is done to collect the features and store them in a `.pickle` file for later use in the application.
+
+### CounterFE
+
+The `CounterFE` takes an input column that in principle contains a list of items in every cell and counts these items. Because the data set is read from a file, the list in every data cell of the column is wrapped in double quotes as a string. Therefore, the string needs to be parsed as a python list by applying (`pandas.DataFrame.apply` [^1]) the `literal_eval` function [^2] from the `ast` package. Then the length of the list for each cell is saved as a single integer value in a new output column. The new column keeps the original name plus an appended `_count`.
+
+The data set contains the countable columns *mentions*, *photos*, *hashtags*, *urls*, *cashtags*, *reply_to*, and *tweet_tokenized*. The motivation for choosing these columns was to gain information about the virality of a tweet by counting, for example, the number of photos, used hashtags, and URLs. The following table shows an example of what the input for the hashtag column could look like and the corresponding output of the feature extractor.
+
+| input column | output column |
+|-------------------------------------------------------------------|-------------- |
+| "['energy', 'visualization', 'data']" | 3 |
+| "[]" | 0 |
+| "['flutter', 'webdevelopment', 'mobiledev', 'datavisualization']" | 4 |
+
+### SentimentFE
+
+The `SentimentFE` feature extractor makes use of a sentiment analyzer. The idea is to analyze the sentiment of the tweet text itself by passing it as an input column. It is assumed that tweets which elicit a strong positive or negative emotion are more likely to go viral. For this task, the VADER (Valence Aware Dictionary and sEntiment Reasoner) analysis tool was used. It is a lexicon and rule-based tool specifically attuned to sentiments expressed in social media. [^3] VADER even supports slang and takes the author's emphasis through capitalization into account. Therefore, it should be well-suited for analyzing tweets.
+
+VADER is applied to the tweet and automatically analyzes the sentiment of the whole tweet, even if it contains multiple sentences. The calculated *positive*, *neutral* and *negative* polarity scores are floating-point numbers between 0 and 1 that sum up to a total of 1.
Only the positive and negative scores are used, since the neutral score is simply the remainder needed to sum up to 1 and therefore contains no new information. Both scores are stored in the output column. It is noteworthy that VADER also offers a compound score, which is a normalized, weighted composite score in a single number between -1 and 1. The following table shows a few example input texts and their corresponding VADER scores. Using the compound score probably would have been a better alternative to using the positive and negative scores: the first two examples demonstrate how the compound score takes capitalization emphasis into account, which the positive score does not seem to do.
+
+| input | output |
+|----------------------------------------------------------|---------------------------------------------------------------|
+| VADER is smart, handsome, and funny. | {'pos': 0.746, 'compound': 0.8316, 'neu': 0.254, 'neg': 0.0} |
+| VADER is VERY SMART, uber handsome, and FRIGGIN FUNNY!!! | {'pos': 0.706, 'compound': 0.9469, 'neu': 0.294, 'neg': 0.0} |
+| Today SUX! | {'pos': 0.0, 'compound': -0.5461, 'neu': 0.221, 'neg': 0.779} |
+| Catch utf-8 emoji such as 💘 and 💋 and 😁 | {'pos': 0.279, 'compound': 0.7003, 'neu': 0.721, 'neg': 0.0} |
+| Not bad at all | {'pos': 0.487, 'compound': 0.431, 'neu': 0.513, 'neg': 0.0} |
+
+## Classification & Results
+
+This section gives insights into experimentations with dimensionality reduction and the motivation for using a k-nearest neighbors and a random forest classifier for the prediction task. Furthermore, the first results are presented. Also, two types of grid searches have been implemented and their results are discussed. Finally, a verdict about the overall performance is given, especially in comparison to the baselines, and doubts about the usefulness of the application in its current state are expressed.
+
+### Dimensionality Reduction
+
+First off, experiments with the already implemented dimensionality reduction technique `SelectKBest` are done. When selecting only the two best features (`k=2`), the resulting non-accuracy metrics after training are all 0. This shows that the two best features alone don't contain enough information for any learning to happen. Therefore, the argument `all` is used to ignore the dimensionality reduction and select every feature instead. The number of features is still small enough that simply training on all features is possible in a reasonable time.
+
+### KNeighborsClassifier & RandomForestClassifier
+
+The implemented classifiers are `KNeighborsClassifier` (k-nearest neighbors) and `RandomForestClassifier`. The former was already provided by the lecturer; the latter was chosen because a random forest can be considered a kind of universal machine learning technique that works very well out of the box on a multitude of data sets according to Jeremy Howard. He further describes some of the properties of a random forest in his introductory Machine Learning lecture on YouTube: [^4]
+- It can predict multiple data types like categories (classification) or continuous variables (regression).
+- It can predict with both structured and unstructured data type columns like pixels, zip codes, revenues, and so on.
+- In general, a separate validation set is not needed, since it generalizes well even if there is only one data set present.
+- It generally does not overfit too badly and it is very easy to prevent overfitting.
+- It makes few, if any, statistical assumptions like assuming that the data is normally distributed, that the relationship is linear, or that you have certain interactions.
+- Feature engineering steps like taking the log of the data or multiplying interactions together are not needed.
+
+So all in all, a random forest classifier seems like a good first candidate to implement when no strong assumptions are made about the data.
+
+### Early Results and Their Improvement Through Additional Features
+
+Initially, only features for the length of the mentions and photos columns as well as the character length of a tweet were implemented. The last four rows in [Fig. 3](#early-results) show the results of the classification at this point in time for the k-nearest neighbors classifier (KNN) with `k=1` and the random forest (RF) classifier with `n=10` estimators, meaning the number of trees in the forest. Other than that, default parameters were used. The metrics for both the training and validation set are quite bad. For example, the Cohen's Kappa coefficient for both classifiers barely reaches the 0.1 mark on the training set. For the validation set, it is even a bit worse. Surprisingly, the precision of the RF has a decent score of 0.69 on the training set and 0.40 on the validation set. Since the performance on both training and validation sets is poor, the models are underfitting and more features are needed.
+
+
+
+Fig. 3. Adding more features increases the performance on the training set a lot and slightly improves the performance on the validation set. While a decent performance is measured on the training set, the performance on the validation set is still quite bad, which is a sign of overfitting.
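+
+For reference, the two classifier configurations compared in Fig. 3 correspond roughly to the following sklearn setup. This is a hedged sketch, not the project's actual `run_classifier.py` invocation; `X_train`, `y_train`, `X_val`, and `y_val` are placeholders, and `evaluate()` refers to the metric helper sketched in the Evaluation section:
+
+```python
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.neighbors import KNeighborsClassifier
+
+# KNN with k=1 neighbor, corresponding to the "knn" rows in Fig. 3.
+knn = KNeighborsClassifier(n_neighbors=1)
+
+# Random forest with n=10 trees, corresponding to the "randomforest" rows.
+rf = RandomForestClassifier(n_estimators=10, random_state=42)
+
+for clf in (knn, rf):
+    clf.fit(X_train, y_train)  # placeholder training features and labels
+    print(type(clf).__name__, evaluate(clf, X_val, y_val))
+```
+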
+
+After adding the sentiment feature, the metrics improved a lot on the training set and a bit on the validation set. For example, the Cohen's Kappa is at roughly 0.60 on the training set and between 0.11 and 0.12 on the validation set. The performance on the validation set is still relatively poor. Without taking precision into account, the metrics have approximately a value between 0.1 and 0.2. The precision is even worse than before with 0.32. The results show that the models can learn something on the training set, but this doesn't generalize well to the validation set. So the models could be overfitting because the parameter values are chosen too high. But this is not the case for the KNN classifier. Therefore, it is assumed that the features contain too little information for proper learning to happen.
+
+Adding the remaining count features (mentioned in the [feature extraction](#CounterFE) section) improves the metrics for the training set to reasonably good scores. Most metrics are in a range between 0.71 and 0.88. Even though the performance on the validation set has improved a bit, it still has to be rated as rather poor overall. Most metrics are in the range of 0.14 and 0.27, so in summary even worse than flipping a coin. Since the model parameters are chosen relatively low, which is especially true for the KNN classifier, it can be assumed that the implemented features just don't contain enough meaningful information to predict the virality of tweets.
+
+### Hyperparameter Optimization with GridSearchCV and Manual Grid Search
+
+To figure out whether a different configuration of parameters would improve the performance on the validation set, a grid search is implemented in two variants. One variant is implemented in `classification_hyper_param.sh` by iteratively invoking the `run_classifier.py` script with different parameter values for the respective classifier in each iteration. The KNN classifier is invoked with `k` values in a range from 1 to 10 and the RF with `n` in a range from 1 to 10 in steps of 1 as well as 10 to 82 in steps of 3. The second implementation uses the `GridSearchCV` class from sklearn to explore the best values for the parameters `criterion`, `min_samples_split`, and `n_estimators`. A multi-metric setup with Cohen's Kappa, recall and precision as well as a refit on Cohen's Kappa is configured. By default, a 5-fold cross-validation is used to evaluate the performance.
+
+Multiple experimentations with the sklearn grid search revealed no significant difference between the two criteria *gini* and *entropy*. Therefore, another run is performed with the default criterion (*gini*), `min_samples_split` with the range 2, 4, 6, and 8 as well as `n_estimators` from 5 to 125 in steps of 12. The results show that `n_estimators` has relatively little impact on the metric scores. Instead, there seems to be a somewhat linear, negative correlation between the `min_samples_split` value and most of the metrics. The highest mean Cohen's Kappa coefficient of 0.211 is achieved at a split of 2 and the lowest at 8 with 0.163. In the same fashion, the mean recall reaches 0.183 and 0.116 for 2 and 8 respectively. For precision, there is an inverse relationship: 2 gives the worst score with 0.423 and 8 the best with 0.544. It can be concluded that the selection of the `min_samples_split` parameter value is a tradeoff between a good precision score and the other metric scores.
The results are saved as [gridsearch_results_crit_1-125-12steps_min2468.csv](results/gridsearch_results_crit_1-125-12steps_min2468.csv).
+
+For the manual grid search, a `min_samples_split` of 2 is chosen so that the individual metrics have a reasonably balanced value, instead of having a high precision value that lowers the other metrics as a side effect. The results of the run on the validation set show once again a linear relationship between Cohen's Kappa, recall, and precision. Furthermore, it is noticeable that the highest scores for Cohen's Kappa are achieved by a random forest with `n` between 9 and 82. Because the Cohen's Kappa scores converge beginning with `n=9`, everything above can be considered too high a model capacity that starts to overfit. Because of the linear relationship, the previously mentioned metrics are more or less the same. Therefore the F1-Score and Jaccard Score are more interesting. For both metrics, the best classifier is KNN with `k=1` and the second-best RF with `n=9`. The following table shows the results for both. Also, all results are saved as [manual_gridsearch_all.csv](results/manual_gridsearch_all.csv).
+
+| Accuracy | Kappa | F1 | Jaccard | Precision | Recall | classifier | data set | k | n |
+|----------|-------------|----------|---------|-----------|--------|--------------|------------|---|---|
+| 0.862 | 0.194 | 0.27 | 0.156 | 0.27 | 0.271 | knn | validation | 1 | - |
+| 0.94 | 0.637 | 0.67 | 0.504 | 0.696 | 0.646 | knn | training | 1 | - |
+| 0.896 | 0.212 | 0.261 | 0.15 | 0.393 | 0.196 | randomforest | validation | - | 9 |
+| 0.973 | 0.833 | 0.847 | 0.735 | 0.928 | 0.779 | randomforest | training | - | 9 |
+
+## Conclusion
+
+The analysis of the data set showed that it is imbalanced and that accuracy is therefore a bad metric for evaluating the prediction performance, which is why it was dismissed. The addition of more features did improve the performance on the training set a lot, but only slightly on the validation set. This is a sign of overfitting. Various grid search runs showed that the used classifiers and parameters were already almost optimal. These hyperparameter optimization runs also revealed that the prediction performance could only be slightly nudged upwards. Most metrics remain between 0.15 and 0.39. All in all, it can be concluded that the implemented features extract too little information to make good predictions about the virality of tweets. This is understandable since most features only take the metadata of tweets into account, but not the actual content of the tweet itself. At least from a user perspective, it would make sense that the actual content of a tweet matters the most.
+
+More work is needed to create proper NLP features that analyze and extract information from the actual content of the tweet. For example, this could be the extraction of names and entities with named entity recognition or the analysis of term frequency and topics with techniques like TF-IDF and topic modeling. So finding out which topics appear in a tweet and how often they occur in comparison to the overall distribution across all tweets in the data set could be informative. Furthermore, it was probably a mistake to use the positive and negative scores of the VADER sentiment analyzer, because the compound score gives a better representation of the sentiment by taking emphasis into account.
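+
+For this future work, the compound score is directly available from the same analyzer that `SentimentFE` already builds on. A minimal sketch using the vaderSentiment package (an illustration, not the existing feature extractor):
+
+```python
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+
+analyzer = SentimentIntensityAnalyzer()
+
+def compound_sentiment(text: str) -> float:
+    """Returns VADER's normalized compound score in the range [-1, 1]."""
+    return analyzer.polarity_scores(text)["compound"]
+
+# Matches the second example row in the SentimentFE table (compound of roughly 0.9469).
+print(compound_sentiment("VADER is VERY SMART, uber handsome, and FRIGGIN FUNNY!!!"))
+```
+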
+
+| Accuracy | Kappa | F1 | Jaccard | Precision | Recall | classifier | data set | k | n |
+|----------|-------------|----------|---------|-----------|--------|--------------|------------|---|---|
+| 0.83 | 0.002 | 0.095 | 0.05 | 0.096 | 0.095 | stratified | test | 1 | - |
+| 0.863 | 0.199 | 0.274 | 0.159 | 0.274 | 0.274 | knn | test | 1 | - |
+| 0.897 | 0.22 | 0.269 | 0.155 | 0.406 | 0.201 | random forest | test | - | 9 |
+
+
+The table above shows that both selected models perform pretty much the same or even slightly better on the test set in comparison to the validation set. The models outperform the stratified baseline by roughly one to three tenths in all metrics. However, all in all, the models' performance still seems too poor to be a useful tool in a production application, because the prediction performance for the validation set on all metrics except accuracy is way below 50%. Flipping an unfair coin that predicts in 90% of the cases that a tweet will not go viral would probably give similarly good or bad predictions as the application. Therefore, it makes no sense to publish the application to users. More work on creating good features would be needed to improve the performance and thereby make the application viable.
+
+## Testing
+
+Four additional tests have been implemented to ensure the intended functionality of the metrics calculations, the two feature extractors, and the tweet cleaner. The tests are located in a subfolder of the respective implementation that is being tested, so that test and implementation remain close to each other.
+
+- [metrics_test.py](../src/classification/test/metrics_test.py)
+- [count_test.py](../src/feature_extraction/test/count_test.py)
+- [sentiment_test.py](../src/feature_extraction/test/sentiment_test.py)
+- [tweet_cleaner_test.py](../src/preprocessing/test/tweet_cleaner_test.py) (branch `Tweetclean`)
+
+---
+
+## References
+
+[^1]: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html
+
+[^2]: https://docs.python.org/3/library/ast.html#ast.literal_eval
+
+[^3]: Taken from https://github.com/cjhutto/vaderSentiment#vader-sentiment-analysis
+
+    VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media. It is fully open-sourced under the [MIT License] (we sincerely appreciate all attributions and readily accept most contributions, but please don't hold us liable).
+
+[^4]: Introduction to Machine Learning for Coders Online Course by Jeremy Howard.
https://www.youtube.com/watch?v=CzdWqFTmn0Y&t=2197s \ No newline at end of file diff --git a/Documentation.md b/docs/Documentation_template.md similarity index 100% rename from Documentation.md rename to docs/Documentation_template.md diff --git a/docs/imgs/after_more_count.png b/docs/imgs/after_more_count.png new file mode 100644 index 00000000..748446e8 Binary files /dev/null and b/docs/imgs/after_more_count.png differ diff --git a/docs/imgs/after_more_count_cropped.png b/docs/imgs/after_more_count_cropped.png new file mode 100644 index 00000000..39a90d21 Binary files /dev/null and b/docs/imgs/after_more_count_cropped.png differ diff --git a/docs/imgs/after_more_count_cropped2.png b/docs/imgs/after_more_count_cropped2.png new file mode 100644 index 00000000..88eee5a4 Binary files /dev/null and b/docs/imgs/after_more_count_cropped2.png differ diff --git a/docs/imgs/after_sentiment_2021-11-03_231550.png b/docs/imgs/after_sentiment_2021-11-03_231550.png new file mode 100644 index 00000000..76758cc3 Binary files /dev/null and b/docs/imgs/after_sentiment_2021-11-03_231550.png differ diff --git a/docs/imgs/baselines_2021-11-03_231550.png b/docs/imgs/baselines_2021-11-03_231550.png new file mode 100644 index 00000000..2a5d8a38 Binary files /dev/null and b/docs/imgs/baselines_2021-11-03_231550.png differ diff --git a/docs/imgs/distribution_of_tweets_per_language.png b/docs/imgs/distribution_of_tweets_per_language.png new file mode 100644 index 00000000..3021b72c Binary files /dev/null and b/docs/imgs/distribution_of_tweets_per_language.png differ diff --git a/docs/imgs/manual_hyper_param_results.png b/docs/imgs/manual_hyper_param_results.png new file mode 100644 index 00000000..3c154091 Binary files /dev/null and b/docs/imgs/manual_hyper_param_results.png differ diff --git a/docs/notes.md b/docs/notes.md new file mode 100644 index 00000000..c62a327b --- /dev/null +++ b/docs/notes.md @@ -0,0 +1,79 @@ + + +Can I link to the image ? + +- o [hard coded domainrepo+ branch + file/page](https://github.com/TobiObeck/MLinPractice/blob/documentation-2/docs/Documentation.mdtweets-language) +- o [hard coded file/page](Documentation.mdtweets-language) +- o [hard coded file/page](./Documentation.mdtweets-language) +- [full path?](/docs/Documentation.mdtweets-language) +- [opens docs folder](./tweets-language) +- ++ [relative same page](tweets-language) +- + relative same page but html + + + + +# visualization.py + + + + + +- Number of tweets: 295811 +- Label distribution: +- False 0.908185 +- True 0.091815 + + +| en | 283240 | +|----|--------| +| es | 3492 | +| fr | 3287 | +| de | 811 | +| it | 748 | + +- 283240 english tweets +- 12571 non english tweets +- 4.438 percent non-english +- 4.4382855528880105 + + +Here's a simple footnote,[^1] and here's a longer one.[^bignote]. And another one [^2] here + + +
+
+
+Fig. 2: The majority of tweet records are labelled as english. The amount of non-english tweets is too small to be usefull for machine learning.
+