95 changes: 44 additions & 51 deletions code/xgboost.py
@@ -1,13 +1,12 @@

import utils
import random
import numpy as np
from xgboost import XGBClassifier
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import TfidfTransformer

# Performs classification using XGBoost.


# Configuration Variables
FREQ_DIST_FILE = '../train-processed-freqdist.pkl'
BI_FREQ_DIST_FILE = '../train-processed-freqdist-bi.pkl'
TRAIN_PROCESSED_FILE = '../train-processed.csv'
@@ -18,42 +17,39 @@
USE_BIGRAMS = True
if USE_BIGRAMS:
BIGRAM_SIZE = 10000
VOCAB_SIZE = UNIGRAM_SIZE + BIGRAM_SIZE
VOCAB_SIZE += BIGRAM_SIZE  # Combined unigram and bigram vocabulary size
FEAT_TYPE = 'frequency'


# Helper function to extract unigrams and bigrams
def get_feature_vector(tweet):
uni_feature_vector = []
bi_feature_vector = []
words = tweet.split()
for i in xrange(len(words) - 1):
for i in range(len(words) - 1): # xrange replaced with range
word = words[i]
next_word = words[i + 1]
if unigrams.get(word):
uni_feature_vector.append(word)
if USE_BIGRAMS:
if bigrams.get((word, next_word)):
bi_feature_vector.append((word, next_word))
if len(words) >= 1:
if unigrams.get(words[-1]):
uni_feature_vector.append(words[-1])
if USE_BIGRAMS and bigrams.get((word, next_word)):
bi_feature_vector.append((word, next_word))
if len(words) >= 1 and unigrams.get(words[-1]):
uni_feature_vector.append(words[-1])
return uni_feature_vector, bi_feature_vector
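# Usage sketch (illustrative, not part of the original file): with toy
# vocabularies such as unigrams = {'good': 1, 'movie': 2} and
# bigrams = {('good', 'movie'): 1}, calling
# get_feature_vector('a good movie') would return
# (['good', 'movie'], [('good', 'movie')]): known n-grams are kept and
# out-of-vocabulary tokens like 'a' are dropped.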


# Function to extract features from the tweets dataset
def extract_features(tweets, batch_size=500, test_file=True, feat_type='presence'):
num_batches = int(np.ceil(len(tweets) / float(batch_size)))
for i in xrange(num_batches):
for i in range(num_batches): # xrange replaced with range
batch = tweets[i * batch_size: (i + 1) * batch_size]
features = lil_matrix((batch_size, VOCAB_SIZE))
labels = np.zeros(batch_size)
for j, tweet in enumerate(batch):
if test_file:
tweet_words = tweet[1][0]
tweet_bigrams = tweet[1][1]
tweet_words, tweet_bigrams = tweet[1]
else:
tweet_words = tweet[2][0]
tweet_bigrams = tweet[2][1]
tweet_words, tweet_bigrams = tweet[2]
labels[j] = tweet[1]

if feat_type == 'presence':
tweet_words = set(tweet_words)
tweet_bigrams = set(tweet_bigrams)
@@ -68,26 +64,15 @@ def extract_features(tweets, batch_size=500, test_file=True, feat_type='presence
features[j, UNIGRAM_SIZE + idx] += 1
yield features, labels
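# Consumption sketch (illustrative): with training tuples of the form
# (tweet_id, sentiment, (unigram_list, bigram_list)), the generator can be
# drained batch by batch, e.g.
#
#   for X_batch, y_batch in extract_features(tweets, batch_size=500,
#                                            test_file=False,
#                                            feat_type='frequency'):
#       pass  # X_batch is a batch_size x VOCAB_SIZE sparse lil_matrix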


# Apply TF-IDF transformation to features
def apply_tf_idf(X):
transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
transformer.fit(X)
return transformer

return transformer.transform(X), transformer  # Return the transformed features and the fitted transformer for reuse
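# Usage sketch (illustrative): fit TF-IDF once on the training batch and
# reuse the fitted transformer so validation/test features share the same
# IDF weights, e.g.
#
#   X_train, tfidf = apply_tf_idf(X_train)
#   X_val = tfidf.transform(X_val)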

# Process and return tweets with features
def process_tweets(csv_file, test_file=True):
"""Returns a list of tuples of type (tweet_id, feature_vector)
or (tweet_id, sentiment, feature_vector)

Args:
csv_file (str): Name of processed csv file generated by preprocess.py
test_file (bool, optional): If processing test file

Returns:
list: Tuples in the formats described above
"""
tweets = []
print 'Generating feature vectors'
print('Generating feature vectors') # Print format changed
with open(csv_file, 'r') as csv:
lines = csv.readlines()
total = len(lines)
@@ -102,64 +87,72 @@ def process_tweets(csv_file, test_file=True):
else:
tweets.append((tweet_id, int(sentiment), feature_vector))
utils.write_status(i + 1, total)
print '\n'
print('\nProcessing complete')
return tweets
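# Resulting structures (illustrative, assuming the CSV layout produced by
# preprocess.py, i.e. `id,tweet` for test files and `id,sentiment,tweet`
# for training files):
#
#   test_file=True  -> (tweet_id, (unigram_list, bigram_list))
#   test_file=False -> (tweet_id, sentiment, (unigram_list, bigram_list))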


if __name__ == '__main__':
np.random.seed(1337)
unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
if USE_BIGRAMS:
bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)

tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)

if TRAIN:
train_tweets, val_tweets = utils.split_data(tweets)
else:
random.shuffle(tweets)
train_tweets = tweets
del tweets
print 'Extracting features & training batches'
clf = XGBClassifier(max_depth=25, silent=False, n_estimators=400)

del tweets # Free up memory

print('Extracting features & training batches')
clf = XGBClassifier(max_depth=25, verbosity=1, n_estimators=400) # Changed 'silent' to 'verbosity'

batch_size = len(train_tweets)
i = 1
n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))

for training_set_X, training_set_y in extract_features(train_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size):
utils.write_status(i, n_train_batches)
i += 1
if FEAT_TYPE == 'frequency':
tfidf = apply_tf_idf(training_set_X)
training_set_X = tfidf.transform(training_set_X)
training_set_X, tfidf = apply_tf_idf(training_set_X)  # Fit TF-IDF on the training features only
clf.fit(training_set_X, training_set_y)
print '\n'
print 'Testing'

print('\nTesting model performance')
if TRAIN:
correct, total = 0, len(val_tweets)
i = 1
batch_size = len(val_tweets)
n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))

for val_set_X, val_set_y in extract_features(val_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size):
if FEAT_TYPE == 'frequency':
val_set_X = tfidf.transform(val_set_X)
val_set_X = tfidf.transform(val_set_X)  # Reuse the transformer fitted on the training data
prediction = clf.predict(val_set_X)
correct += np.sum(prediction == val_set_y)
utils.write_status(i, n_val_batches)
i += 1
print '\nCorrect: %d/%d = %.4f %%' % (correct, total, correct * 100. / total)

accuracy = correct * 100. / total
print(f'\nCorrect: {correct}/{total} = {accuracy:.4f}%')
else:
del train_tweets
test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True)
n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
predictions = np.array([])
print 'Predicting batches'
print('Predicting test set batches')
i = 1
n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))

for test_set_X, _ in extract_features(test_tweets, test_file=True, feat_type=FEAT_TYPE, batch_size=batch_size):  # batch_size must match the one used to compute n_test_batches
if FEAT_TYPE == 'frequency':
test_set_X = tfidf.transform(test_set_X)
test_set_X = tfidf.transform(test_set_X)  # Reuse the transformer fitted on the training data
prediction = clf.predict(test_set_X)
predictions = np.concatenate((predictions, prediction))
utils.write_status(i, n_test_batches)
i += 1
predictions = [(str(j), int(predictions[j]))
for j in range(len(test_tweets))]
utils.save_results_to_csv(predictions, 'xgboost.csv')
print '\nSaved to xgboost.csv'

predictions = [(str(j), int(predictions[j])) for j in range(len(test_tweets))]
utils.save_results_to_csv(predictions, 'xgboost_predictions.csv')
print('\nResults saved to xgboost_predictions.csv')