# virality_predictor.py

# -*- coding: utf-8 -*-
"""Virality_Predictor.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1BGXZ5bG1TKQEsuU03b8gJEAxcrxN21Er

# Virality Predictor

### Import Modules
"""

import os
import pandas as pd
from datetime import datetime
import numpy as np

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer

import re
from bs4 import BeautifulSoup

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

"""### Load Data

#### Shared Articles
"""

# Load the shared-articles table from an absolute /content path
# (presumably the Colab working directory — adjust when running elsewhere).
articles = pd.read_csv("/content/shared_articles.csv")
# Bare expression: only displays something in a notebook, no effect in a script.
articles.shape

"""#### User Interactions"""

# Load the per-user interaction events from the same directory.
interactions = pd.read_csv("/content/users_interactions.csv")
# Number of distinct contentId values that appear in the interactions
# (display-only expression; the result is not stored).
len(interactions['contentId'].unique())

"""### Process and Analyse Data

#### Count Views, Likes, Comments, Follows and Bookmarks for each ContentID
"""

# Count the non-null 'timestamp' values per (contentId, eventType) pair;
# 'timestamp' is only used as the column being counted.
counts = interactions[['contentId', 'eventType', 'timestamp']].groupby(['contentId', 'eventType']).count()
counts.columns = ['count']

# Reshape to one row per contentId with one column per eventType value;
# (contentId, eventType) pairs that never occurred are filled with 0.
counts = pd.pivot_table(counts, values=['count'], index=['contentId'], columns=['eventType'], fill_value=0)['count']
# Turn contentId back into a regular column so it can be used as a join key later.
counts.reset_index(inplace=True)
counts.head()

"""#### Calculate Virality

**Given Formulation: VIRALITY = 1 * VIEW + 4 * LIKE + 10 * COMMENT + 25 * FOLLOW + 100 * BOOKMARK**
"""

# Weighted sum of the per-event counts, using the weights of the given
# formulation: VIEW=1, LIKE=4, COMMENT CREATED=10, FOLLOW=25, BOOKMARK=100.
counts['VIRALITY'] = (
    counts['VIEW']
    + counts['LIKE'] * 4
    + counts['COMMENT CREATED'] * 10
    + counts['FOLLOW'] * 25
    + counts['BOOKMARK'] * 100
)
counts.head()

"""#### Analyse Distribution of Count Data"""

counts.describe()

"""#### Analyse ContentIDs in articles and user interactions"""

# Distinct article ids on each side of the upcoming join.
article_unique = set(articles['contentId'])
interactions_unique = set(counts['contentId'])

print("Number of unique ContentID in articles: ", len(article_unique))
print("Number of ContentID in counts: ", len(interactions_unique))

# Articles for which no interaction was recorded.
content_ids_not_available = article_unique - interactions_unique
print("Number of ContentID not present in interactions: ", len(content_ids_not_available))

# Interactions that point at an id missing from the articles table
# (the same name is deliberately re-bound, as before).
content_ids_not_available = interactions_unique - article_unique
print("Number of ContentID not present in articles: ", len(content_ids_not_available))

"""##### Hence, the dataset has no user interaction information for 70 articles.

#### Join Articles and Counts of User Interaction
"""

# Inner join on contentId: only articles that also have interaction counts survive.
data = articles.set_index('contentId').join(counts.set_index('contentId'), how = 'inner')
data.head()

"""#### Remove Content Removed Rows"""

# Drop the rows whose article-side eventType is 'CONTENT REMOVED'.
# NOTE(review): this filters rows, not articles — if a removed contentId also has
# a row with a different eventType, that row is kept. Confirm this is intended.
data = data[data['eventType'] != 'CONTENT REMOVED']

"""#### Drop URL, contentType, eventType, NaN and ID columns

##### Identify columns having NaN
"""

# Which columns contain at least one missing value (notebook display only).
data.isnull().any()

"""##### Analyse NaN columns"""

# Non-null row counts of the three author columns, against the total row count.
print("Number of rows with country information:", data['authorCountry'].count(), "out of", len(data))
print("Number of rows with region information:", data['authorRegion'].count(), "out of", len(data))
print("Number of rows with user agent information:", data['authorUserAgent'].count(), "out of", len(data))

"""##### Since only 22% of the data rows contain the above information, it would be difficult to analyse its effect on virality, and hence these columns are dropped.

##### Analyse Content Type
"""

# Frequency of each contentType value (display-only expression; result is not stored).
data['contentType'].value_counts()

"""##### Since more than 95% of the articles have the HTML content type, it won't make a big difference to virality, and hence the column is dropped.

##### Drop columns
"""

# Remove the sparse author columns, the identifier columns and the columns
# judged uninformative in the analysis above.
data = data.drop(
    columns=[
        'authorUserAgent',
        'authorCountry',
        'authorRegion',
        'authorSessionId',
        'authorPersonId',
        'url',
        'contentType',
        'eventType',
    ]
)
data.head()

"""#### Process Language, Title and Text

##### Counts of articles based on language
"""

data['lang'].value_counts()

"""##### Considering the scope of the project and above statistics, only English articles will be kept."""

# Keep only the rows whose language code is 'en'.
is_english = data['lang'] == 'en'
data = data.loc[is_english]
data.head()

"""##### Clean and Tokenize Title + Text

- Clean Text by removing bad symbols and stopwords
- Reference: https://github.com/radonys/Reddit-Flair-Detector/blob/master/Jupyter%20Notebooks/Reddit_Flair_Detector.ipynb (my GitHub repository)
- Tokenize and Stemming text using Gensim modules
- Reference: 1) https://tedboy.github.io/nlps/generated/generated/gensim.utils.simple_preprocess.html, 2) https://radimrehurek.com/gensim/parsing/porter.html
"""

# Separator punctuation that is replaced by a space.
# Raw string literals are used because '\[', '\]' and '\|' are invalid escape
# sequences in an ordinary string and raise a warning on recent Python versions.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is not a digit, a lowercase ASCII letter, a space,
# '#', '+' or '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NLTK's English stopword list, held in a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Return *text* stripped of markup, punctuation and English stopwords.

    Processing order: HTML to plain text, lowercase, separator punctuation
    to spaces, whitespace normalisation, removal of the remaining unwanted
    symbols, stopword removal.

    Args:
        text: Raw string, possibly containing HTML markup.

    Returns:
        The cleaned text, with the remaining words joined by single spaces.
    """
    text = BeautifulSoup(text, "lxml").text
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    # BAD_SYMBOLS_RE only whitelists the plain space character, so newlines
    # and tabs would be deleted and glue neighbouring words together
    # ("end\nstart" -> "endstart"). Collapse all whitespace to single spaces
    # before the bad symbols are stripped.
    text = ' '.join(text.split())
    text = BAD_SYMBOLS_RE.sub('', text)

    return ' '.join(word for word in text.split() if word not in STOPWORDS)

def token_stem(text):
    """Tokenize *text* and reduce every token to its Porter stem.

    Tokenization is delegated to gensim's ``simple_preprocess`` (called with
    ``deacc=True``); each resulting token is then passed through a
    ``PorterStemmer``.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the stemmed tokens, in their original order.
    """
    stem = PorterStemmer().stem
    return [stem(token) for token in simple_preprocess(text, deacc=True)]

# `data` is the result of a boolean filter at this point, so assigning columns
# into it in place triggers pandas' SettingWithCopyWarning. Build a new frame
# instead: assign() and drop() both return fresh objects, and assign() keeps
# the replaced columns in their original positions.
data = data.assign(
    title=data['title'].apply(clean_text),
    text=data['text'].apply(clean_text),
).drop(columns=['lang'])

data.head()

"""#### Exploratory Analysis

##### Time Units vs Virality

Since this is time-series data, it is important to analyse the effect of time on virality; hence virality is analysed against the hour and minute (of the day) and the month.
"""

# One panel per time unit, drawn top to bottom.
units = ['hour', 'minute', 'month']
# None selects matplotlib's default colour for the first panel.
colours = [None, 'tab:red', 'tab:orange']

# Size the figure at creation so tight_layout() computes the margins for the
# final dimensions instead of for a size that is changed afterwards.
fig, axs = plt.subplots(len(units), 1, figsize=(5, 8))

# Convert the epoch-second timestamps once. Unlike datetime.fromtimestamp,
# pd.to_datetime does not depend on the local time zone of the machine
# (values are interpreted as UTC), so the plots are reproducible.
timestamps = pd.to_datetime(data['timestamp'], unit='s')
virality = data['VIRALITY'].to_numpy()

for ax, unit, colour in zip(axs.flat, units, colours):
    df = pd.DataFrame({
        unit: getattr(timestamps.dt, unit).to_numpy(),
        'virality': virality,
    })
    df.sort_values(unit, inplace=True)

    ax.plot(df[unit], df['virality'], color=colour)
    ax.set_title(unit.capitalize() + ' vs Virality')
    ax.set(xlabel=unit, ylabel='virality')

fig.tight_layout()
fig.show()

"""Since the patterns don't give a clear picture (the peaks are the result of a few large values), time is not considered for the ML model."""

# Time is not used as a model feature, so the column is removed.
data.drop('timestamp', axis=1, inplace=True)

"""##### Correlation Matrix Between Numerical Values"""

# `data` still contains the string columns 'title' and 'text'. Since pandas 2.0
# DataFrame.corr() no longer skips non-numeric columns silently and raises
# instead, so the numeric columns are selected explicitly (this also works on
# older pandas versions).
corr_mat = data.select_dtypes(include='number').corr()
sns.heatmap(corr_mat, annot=True)

plt.show()

"""As we can see from above, the numerical variables are correlated with Virality in the following order (high to low): Bookmark, Like, View, Follow and Comment Created, which differs from the order of the coefficients in the problem statement.

### Machine Learning Model

I would examine the model under two conditions. One with text and title information and one without considering it. The ML models considered for the project are: Linear Regression, Polynomial Regression, Random Forest, and Lasso.

#### Linear Regression
"""

def linear_reg(X_train, X_test, y_train, y_test):
    """Fit a linear regression and print its metrics on the test split.

    Args:
        X_train: Feature matrix used for fitting.
        X_test: Feature matrix used for evaluation.
        y_train: Target values matching ``X_train``.
        y_test: Target values matching ``X_test``.

    Returns:
        None. R-square, root mean-square error and mean absolute error of the
        test-set predictions are printed.
    """
    model = LinearRegression(n_jobs=-1)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # RMSE is derived from the MSE by hand: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in recent scikit-learn
    # releases. For a single target the result is the same.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5

    print("R-Square Value:", r2_score(y_test, y_pred))
    print("Root Mean-Square Error:", rmse)
    print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred), "\n")

"""#### Polynomial Regression"""

def polynomial_reg(X_train, X_test, y_train, y_test):
    """Fit a degree-2 polynomial regression and print its test-split metrics.

    The features are expanded with ``PolynomialFeatures(degree=2,
    include_bias=False)`` and a linear regression is fitted on the expanded
    training features.

    Args:
        X_train: Feature matrix used for fitting.
        X_test: Feature matrix used for evaluation.
        y_train: Target values matching ``X_train``.
        y_test: Target values matching ``X_test``.

    Returns:
        None. R-square, root mean-square error and mean absolute error of the
        test-set predictions are printed.
    """
    transformer = PolynomialFeatures(degree=2, include_bias=False)
    # The expansion is fitted on the training split only and then re-used
    # unchanged for the test split.
    x_ = transformer.fit_transform(X_train)

    model = LinearRegression(n_jobs=-1)
    model.fit(x_, y_train)

    y_pred = model.predict(transformer.transform(X_test))

    # RMSE is derived from the MSE by hand: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in recent scikit-learn
    # releases. For a single target the result is the same.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5

    print("R-Square Value:", r2_score(y_test, y_pred))
    print("Root Mean-Square Error:", rmse)
    print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred), "\n")

"""#### Random-Forest Regression"""

def rf_reg(X_train, X_test, y_train, y_test):
    """Fit a random-forest regressor and print its metrics on the test split.

    The forest is created with ``random_state=2020`` so repeated runs build
    the same model, and with ``n_jobs=-1``.

    Args:
        X_train: Feature matrix used for fitting.
        X_test: Feature matrix used for evaluation.
        y_train: Target values matching ``X_train``.
        y_test: Target values matching ``X_test``.

    Returns:
        None. R-square, root mean-square error and mean absolute error of the
        test-set predictions are printed.
    """
    model = RandomForestRegressor(n_jobs=-1, random_state=2020)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # RMSE is derived from the MSE by hand: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in recent scikit-learn
    # releases. For a single target the result is the same.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5

    print("R-Square Value:", r2_score(y_test, y_pred))
    print("Root Mean-Square Error:", rmse)
    print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred), "\n")

"""#### Lasso"""

def lasso(X_train, X_test, y_train, y_test):
    """Fit a Lasso regression and print its metrics on the test split.

    The model is created with ``random_state=2020`` and otherwise default
    settings.

    Args:
        X_train: Feature matrix used for fitting.
        X_test: Feature matrix used for evaluation.
        y_train: Target values matching ``X_train``.
        y_test: Target values matching ``X_test``.

    Returns:
        None. R-square, root mean-square error and mean absolute error of the
        test-set predictions are printed.
    """
    model = Lasso(random_state=2020)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # RMSE is derived from the MSE by hand: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in recent scikit-learn
    # releases. For a single target the result is the same.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5

    print("R-Square Value:", r2_score(y_test, y_pred))
    print("Root Mean-Square Error:", rmse)
    print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred), "\n")

"""#### Call for all models"""

def run_models(X_train, X_test, y_train, y_test):
    """Train and report every regressor, one after another, on the same split.

    For each model a heading line is printed, followed by the metrics that
    the corresponding helper prints.

    Args:
        X_train: Feature matrix used for fitting.
        X_test: Feature matrix used for evaluation.
        y_train: Target values matching ``X_train``.
        y_test: Target values matching ``X_test``.
    """
    runners = (
        ("Linear Regression:", linear_reg),
        ("Polynomial Regression:", polynomial_reg),
        ("Random-Forest Regression:", rf_reg),
        ("Lasso:", lasso),
    )

    for heading, runner in runners:
        print(heading)
        runner(X_train, X_test, y_train, y_test)

"""#### Data with Numerical Parameters"""

# Numeric-only view of the data: the five interaction counts plus the target.
data_num = data[['BOOKMARK', 'COMMENT CREATED', 'FOLLOW', 'LIKE', 'VIEW', 'VIRALITY']]

"""##### Train-Test Split

80% Training Data, 20% Test Data
"""

# Features are every numeric column except the target; fixed seed for a
# reproducible split.
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(
    data_num.drop(columns='VIRALITY'),
    data_num['VIRALITY'],
    test_size=0.2,
    random_state=2020,
)

"""#### Data with Numerical and Text Parameters"""

# TF-IDF over the stemmed tokens, limited to 100 features.
vectorizer = TfidfVectorizer(tokenizer=token_stem, max_features=100)

# Join title and body with a space: plain concatenation would fuse the last
# word of the title with the first word of the text into one bogus token.
tfidf = vectorizer.fit_transform(data['title'] + ' ' + data['text'])
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf)
# from_spmatrix labels the columns 0..n-1 (integers). Combined with the string
# column names of `data` that gives mixed-type feature names, which recent
# scikit-learn estimators reject, so the TF-IDF columns get string names.
tfidf_df.columns = ['tfidf_{}'.format(i) for i in range(tfidf_df.shape[1])]

data_all = data.copy()

# Positional index on both frames so the column-wise concat lines the rows up.
data_all.reset_index(drop=True, inplace=True)
data_all.drop(['text', 'title'], axis = 1, inplace=True)

data_all = pd.concat([data_all, tfidf_df], axis = 1)

data_all.head()

"""##### Train-Test Split

80% Training Data, 20% Test Data
"""

# Every column except the target is used as a feature.
cols = [col for col in data_all.columns if col != 'VIRALITY']

X_train_nt, X_test_nt, y_train_nt, y_test_nt = train_test_split(
    data_all[cols],
    data_all['VIRALITY'],
    test_size=0.2,
    random_state=2020,
)

"""#### Run Models with Numerical Values"""

run_models(X_train_n, X_test_n, y_train_n, y_test_n)

"""#### Run Models with Numerical and Text Values"""

run_models(X_train_nt, X_test_nt, y_train_nt, y_test_nt)