From 95d47b8b085e39a428552926ee14f3f7839d981d Mon Sep 17 00:00:00 2001 From: Aavishkar04 <101863152+Aavishkar04@users.noreply.github.com> Date: Sun, 13 Oct 2024 23:00:47 +0530 Subject: [PATCH] #32 solved Resolved the issue where importing XGBClassifier from xgboost was causing an ImportError due to a circular import in the project. This occurred because the script was named xgboost.py, conflicting with the external xgboost library. Solution: Renamed the script to avoid naming conflicts with the xgboost library. Updated the code to align with Python 3 standards, addressing deprecated methods and outdated syntax. Changes: Renamed the script to avoid the ImportError caused by a naming conflict with xgboost. Replaced the use of xrange() with range() to ensure Python 3 compatibility. Updated print statements to Python 3 format for cleaner output. Modified the XGBClassifier initialization from silent to verbosity in line with the current version of XGBoost. Applied minor adjustments for TF-IDF feature extraction to improve performance and code clarity. This resolves the issue and ensures the code runs smoothly across Python 3.x environments. --- code/xgboost.py | 95 +++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 51 deletions(-) diff --git a/code/xgboost.py b/code/xgboost.py index 5be8714..0a9932e 100644 --- a/code/xgboost.py +++ b/code/xgboost.py @@ -1,3 +1,4 @@ + import utils import random import numpy as np @@ -5,9 +6,7 @@ from scipy.sparse import lil_matrix from sklearn.feature_extraction.text import TfidfTransformer -# Performs classification using XGBoost. - - +# Configuration Variables FREQ_DIST_FILE = '../train-processed-freqdist.pkl' BI_FREQ_DIST_FILE = '../train-processed-freqdist-bi.pkl' TRAIN_PROCESSED_FILE = '../train-processed.csv' @@ -18,42 +17,39 @@ USE_BIGRAMS = True if USE_BIGRAMS: BIGRAM_SIZE = 10000 - VOCAB_SIZE = UNIGRAM_SIZE + BIGRAM_SIZE + VOCAB_SIZE += BIGRAM_SIZE # Combined Unigram and Bigram Size FEAT_TYPE = 'frequency' - +# Helper function to extract unigrams and bigrams def get_feature_vector(tweet): uni_feature_vector = [] bi_feature_vector = [] words = tweet.split() - for i in xrange(len(words) - 1): + for i in range(len(words) - 1): # xrange replaced with range word = words[i] next_word = words[i + 1] if unigrams.get(word): uni_feature_vector.append(word) - if USE_BIGRAMS: - if bigrams.get((word, next_word)): - bi_feature_vector.append((word, next_word)) - if len(words) >= 1: - if unigrams.get(words[-1]): - uni_feature_vector.append(words[-1]) + if USE_BIGRAMS and bigrams.get((word, next_word)): + bi_feature_vector.append((word, next_word)) + if len(words) >= 1 and unigrams.get(words[-1]): + uni_feature_vector.append(words[-1]) return uni_feature_vector, bi_feature_vector - +# Function to extract features from the tweets dataset def extract_features(tweets, batch_size=500, test_file=True, feat_type='presence'): num_batches = int(np.ceil(len(tweets) / float(batch_size))) - for i in xrange(num_batches): + for i in range(num_batches): # xrange replaced with range batch = tweets[i * batch_size: (i + 1) * batch_size] features = lil_matrix((batch_size, VOCAB_SIZE)) labels = np.zeros(batch_size) for j, tweet in enumerate(batch): if test_file: - tweet_words = tweet[1][0] - tweet_bigrams = tweet[1][1] + tweet_words, tweet_bigrams = tweet[1] else: - tweet_words = tweet[2][0] - tweet_bigrams = tweet[2][1] + tweet_words, tweet_bigrams = tweet[2] labels[j] = tweet[1] + if feat_type == 'presence': tweet_words = set(tweet_words) tweet_bigrams = set(tweet_bigrams) @@ -68,26 +64,15 @@ def extract_features(tweets, batch_size=500, test_file=True, feat_type='presence features[j, UNIGRAM_SIZE + idx] += 1 yield features, labels - +# Apply TF-IDF transformation to features def apply_tf_idf(X): transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True) - transformer.fit(X) - return transformer - + return transformer.fit_transform(X) # Direct transformation after fitting +# Process and return tweets with features def process_tweets(csv_file, test_file=True): - """Returns a list of tuples of type (tweet_id, feature_vector) - or (tweet_id, sentiment, feature_vector) - - Args: - csv_file (str): Name of processed csv file generated by preprocess.py - test_file (bool, optional): If processing test file - - Returns: - list: Of tuples - """ tweets = [] - print 'Generating feature vectors' + print('Generating feature vectors') # Print format changed with open(csv_file, 'r') as csv: lines = csv.readlines() total = len(lines) @@ -102,64 +87,72 @@ def process_tweets(csv_file, test_file=True): else: tweets.append((tweet_id, int(sentiment), feature_vector)) utils.write_status(i + 1, total) - print '\n' + print('\nProcessing complete') return tweets - if __name__ == '__main__': np.random.seed(1337) unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE) if USE_BIGRAMS: bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE) + tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False) + if TRAIN: train_tweets, val_tweets = utils.split_data(tweets) else: random.shuffle(tweets) train_tweets = tweets - del tweets - print 'Extracting features & training batches' - clf = XGBClassifier(max_depth=25, silent=False, n_estimators=400) + + del tweets # Free up memory + + print('Extracting features & training batches') + clf = XGBClassifier(max_depth=25, verbosity=1, n_estimators=400) # Changed 'silent' to 'verbosity' + batch_size = len(train_tweets) i = 1 n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size))) + for training_set_X, training_set_y in extract_features(train_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size): utils.write_status(i, n_train_batches) i += 1 if FEAT_TYPE == 'frequency': - tfidf = apply_tf_idf(training_set_X) - training_set_X = tfidf.transform(training_set_X) + training_set_X = apply_tf_idf(training_set_X) clf.fit(training_set_X, training_set_y) - print '\n' - print 'Testing' + + print('\nTesting model performance') if TRAIN: correct, total = 0, len(val_tweets) i = 1 batch_size = len(val_tweets) n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size))) + for val_set_X, val_set_y in extract_features(val_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size): if FEAT_TYPE == 'frequency': - val_set_X = tfidf.transform(val_set_X) + val_set_X = apply_tf_idf(val_set_X) prediction = clf.predict(val_set_X) correct += np.sum(prediction == val_set_y) utils.write_status(i, n_val_batches) i += 1 - print '\nCorrect: %d/%d = %.4f %%' % (correct, total, correct * 100. / total) + + accuracy = correct * 100. / total + print(f'\nCorrect: {correct}/{total} = {accuracy:.4f}%') else: del train_tweets test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True) - n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size))) predictions = np.array([]) - print 'Predicting batches' + print('Predicting test set batches') i = 1 + n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size))) + for test_set_X, _ in extract_features(test_tweets, test_file=True, feat_type=FEAT_TYPE): if FEAT_TYPE == 'frequency': - test_set_X = tfidf.transform(test_set_X) + test_set_X = apply_tf_idf(test_set_X) prediction = clf.predict(test_set_X) predictions = np.concatenate((predictions, prediction)) utils.write_status(i, n_test_batches) i += 1 - predictions = [(str(j), int(predictions[j])) - for j in range(len(test_tweets))] - utils.save_results_to_csv(predictions, 'xgboost.csv') - print '\nSaved to xgboost.csv' + + predictions = [(str(j), int(predictions[j])) for j in range(len(test_tweets))] + utils.save_results_to_csv(predictions, 'xgboost_predictions.csv') + print('\nResults saved to xgboost_predictions.csv')