diff --git a/NLP_notebook.ipynb b/NLP_notebook.ipynb index d820afb..a2bca6e 100644 --- a/NLP_notebook.ipynb +++ b/NLP_notebook.ipynb @@ -45,7 +45,9 @@ { "cell_type": "code", "execution_count": 251, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import keras\n", @@ -67,17 +69,21 @@ { "cell_type": "code", "execution_count": 252, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ - "input_file = codecs.open(\"socialmedia_relevant_cols.csv\", \"r\",encoding='utf-8', errors='replace')\n", - "output_file = open(\"socialmedia_relevant_cols_clean.csv\", \"w\")\n", + "input_fn = 'socialmedia_relevant_cols.csv'\n", + "output_fn = 'socialmedia_relevant_cols_clean.csv'\n", "\n", - "def sanitize_characters(raw, clean): \n", - " for line in input_file:\n", - " out = line\n", - " output_file.write(line)\n", - "sanitize_characters(input_file, output_file)" + "def sanitize_characters(raw, clean):\n", + " with codecs.open(raw, \"r\",encoding='utf-8', errors='replace') as input_file:\n", + " with open(clean, \"w\") as output_file:\n", + " for line in input_file:\n", + " out = line\n", + " output_file.write(line.replace('#', ''))\n", + "sanitize_characters(input_fn, output_fn)" ] }, { @@ -91,7 +97,9 @@ { "cell_type": "code", "execution_count": 253, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -177,7 +185,9 @@ { "cell_type": "code", "execution_count": 254, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -268,7 +278,9 @@ { "cell_type": "code", "execution_count": 255, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -362,7 +374,9 @@ { "cell_type": "code", "execution_count": 256, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -458,7 +472,9 @@ { "cell_type": "code", "execution_count": 257, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -571,6 +587,7 @@ "cell_type": "code", "execution_count": 258, "metadata": { + "collapsed": false, "scrolled": true }, "outputs": [ @@ -667,7 +684,9 @@ { "cell_type": "code", "execution_count": 259, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -782,7 +801,9 @@ { "cell_type": "code", "execution_count": 260, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -808,7 +829,9 @@ { "cell_type": "code", "execution_count": 261, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -861,7 +884,9 @@ { "cell_type": "code", "execution_count": 262, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", @@ -895,7 +920,9 @@ { "cell_type": "code", "execution_count": 263, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -951,7 +978,9 @@ { "cell_type": "code", "execution_count": 264, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", @@ -974,7 +1003,9 @@ { "cell_type": "code", "execution_count": 265, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -1017,7 +1048,9 @@ { "cell_type": "code", "execution_count": 266, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -1054,7 +1087,9 @@ { "cell_type": "code", "execution_count": 267, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1097,7 +1132,9 @@ { "cell_type": "code", "execution_count": 268, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def get_most_important_features(vectorizer, model, n=5):\n", @@ -1122,7 +1159,9 @@ { "cell_type": "code", "execution_count": 269, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1195,7 +1234,9 @@ { "cell_type": "code", "execution_count": 270, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def tfidf(data):\n", @@ -1212,7 +1253,9 @@ { "cell_type": "code", "execution_count": 271, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1241,7 +1284,9 @@ { "cell_type": "code", "execution_count": 272, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "clf_tfidf = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', \n", @@ -1254,7 +1299,9 @@ { "cell_type": "code", "execution_count": 273, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -1280,7 +1327,9 @@ { "cell_type": "code", "execution_count": 274, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1336,7 +1385,9 @@ { "cell_type": "code", "execution_count": 275, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "importance_tfidf = get_most_important_features(tfidf_vectorizer, clf_tfidf, 10)" @@ -1345,7 +1396,9 @@ { "cell_type": "code", "execution_count": 276, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1388,7 +1441,9 @@ { "cell_type": "code", "execution_count": 277, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import gensim\n", @@ -1400,7 +1455,9 @@ { "cell_type": "code", "execution_count": 278, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):\n", @@ -1424,7 +1481,9 @@ { "cell_type": "code", "execution_count": 279, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "embeddings = get_word2vec_embeddings(word2vec, clean_questions)\n", @@ -1435,7 +1494,9 @@ { "cell_type": "code", "execution_count": 280, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1464,7 +1525,9 @@ { "cell_type": "code", "execution_count": 281, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "clf_w2v = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', \n", @@ -1476,7 +1539,9 @@ { "cell_type": "code", "execution_count": 282, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -1502,7 +1567,9 @@ { "cell_type": "code", "execution_count": 283, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1564,7 +1631,9 @@ { "cell_type": "code", "execution_count": 284, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from lime import lime_text\n", @@ -1590,7 +1659,9 @@ { "cell_type": "code", "execution_count": 285, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def explain_one_instance(instance, class_names):\n", @@ -1608,7 +1679,9 @@ { "cell_type": "code", "execution_count": 286, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stderr", @@ -37470,6 +37543,7 @@ "cell_type": "code", "execution_count": 287, "metadata": { + "collapsed": false, "scrolled": true }, "outputs": [ @@ -73332,7 +73406,9 @@ { "cell_type": "code", "execution_count": 288, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stderr", @@ -73398,7 +73474,9 @@ { "cell_type": "code", "execution_count": 289, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -73451,7 +73529,9 @@ { "cell_type": "code", "execution_count": 290, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -73504,7 +73584,9 @@ { "cell_type": "code", "execution_count": 291, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from keras.layers import Dense, Input, Flatten, Dropout, Merge\n", @@ -73567,7 +73649,9 @@ { "cell_type": "code", "execution_count": 292, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stderr", @@ -73590,7 +73674,9 @@ { "cell_type": "code", "execution_count": 293, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -73646,7 +73732,9 @@ { "cell_type": "code", "execution_count": 294, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# code from https://github.com/ajmanser/Yelp\n", @@ -73675,7 +73763,9 @@ { "cell_type": "code", "execution_count": 295, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def sample(preds, temperature=1.0):\n", @@ -73715,7 +73805,9 @@ { "cell_type": "code", "execution_count": 296, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -73759,7 +73851,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from nltk.corpus import wordnet as wn\n", @@ -73851,7 +73945,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -73901,7 +73997,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.3" + "version": "3.6.0" } }, "nbformat": 4,