class BaseEvaluator:
    """Harness that loads the input CSV, calls ``predict`` and writes the results."""

    def predict(self, df: pd.DataFrame) -> Generator[int, Any, None]:
        """Yield one prediction per row of ``df``.

        Subclasses must override this method; the base implementation
        always raises ``NotImplementedError``.
        """
        raise NotImplementedError

    def run(self) -> None:
        """Read the input CSV and write ``OK`` followed by one prediction per line.

        When the ``DOXA_STREAMS`` environment variable is set, input is read
        from ``$DOXA_STREAMS/in`` and output is written to ``$DOXA_STREAMS/out``.
        Otherwise ``train.csv`` is read and ``predictions.csv`` is written,
        both relative to the current working directory.
        """
        stream_directory = os.environ.get("DOXA_STREAMS")

        in_file = f"{stream_directory}/in" if stream_directory else "train.csv"
        out_file = f"{stream_directory}/out" if stream_directory else "predictions.csv"

        with (
            open(in_file, "r", encoding="utf8") as r,
            # Explicit encoding so the output does not depend on the host locale.
            open(out_file, "w", encoding="utf8") as w,
        ):
            # "OK" is written and flushed before the input is parsed
            # (presumably a readiness signal for the platform - confirm).
            w.write("OK\n")
            w.flush()

            df = pd.read_csv(r)
            for prediction in self.predict(df):
                # Flush per line so every prediction is handed over as soon
                # as it is produced rather than sitting in the buffer.
                w.write(f"{prediction}\n")
                w.flush()
# Matches runs of ASCII letters and apostrophes, case-insensitively.
re_tok = re.compile(r"(?i)([a-z']+)")


def to_bag_of_words(sentence) -> set:
    """Return the set of distinct lower-cased word tokens in ``sentence``.

    Args:
        sentence: Text to tokenise. Any non-string value (for example
            ``None`` or a float NaN) is treated as having no tokens.

    Returns:
        The set of tokens matched by ``re_tok``; empty when nothing matches.
    """
    if not isinstance(sentence, str):
        # Missing values have no .lower(); give them an empty bag instead
        # of raising AttributeError.
        return set()
    return set(re_tok.findall(sentence.lower()))


def jaccard_similarity(bag_of_words_1: set, bag_of_words_2: set) -> float:
    """Return the Jaccard index (|intersection| / |union|) of two token sets.

    Args:
        bag_of_words_1: Tokens of the first sentence.
        bag_of_words_2: Tokens of the second sentence.

    Returns:
        A float in [0.0, 1.0]; 0.0 when the sets share no token, which
        includes the case where both sets are empty.
    """
    tokens_in_both_sentences = len(bag_of_words_1.intersection(bag_of_words_2))
    if tokens_in_both_sentences == 0:
        # Also guards the division below against an empty union.
        return 0.0
    tokens_in_any_sentence = len(bag_of_words_1.union(bag_of_words_2))
    return tokens_in_both_sentences / tokens_in_any_sentence
class Evaluator(BaseEvaluator):
    """Bag-of-words baseline that scores sentence pairs by Jaccard token overlap."""

    def predict(self, df: pd.DataFrame) -> Generator[int, Any, None]:
        """Write all the code you need to generate predictions for the test set here!

        Args:
            df (pd.DataFrame): This is a dataframe containing `sentence_1` and
                `sentence_2`, just as in the training data. It is not modified.

        Yields:
            Generator[int, Any, None]: For each pair of sentences in `df`, yield
                your similarity prediction, which should be an integer in the
                range [0, 100].
        """
        # Tokenise every distinct sentence once, however often it appears.
        unique_sentences = set(df["sentence_1"]) | set(df["sentence_2"])
        bags_of_words = {
            sentence: to_bag_of_words(sentence) for sentence in unique_sentences
        }

        for sentence_1, sentence_2 in zip(df["sentence_1"], df["sentence_2"]):
            ratio = jaccard_similarity(
                bags_of_words[sentence_1], bags_of_words[sentence_2]
            )
            # Truncate to a whole percentage. Rounding to 9 decimals first
            # absorbs binary float artefacts such as
            # 100 * (29 / 50) == 57.99999999999999, which would otherwise
            # truncate to 57 instead of 58.
            percent = int(round(100 * ratio, 9))
            yield min(100, max(0, percent))