dsc-sookmyung · moon0727 · Oct 1, 2024 · Oct 7, 2024 · Oct 10, 2024 · Oct 15, 2024
diff --git a/week1/[MLNovice]김문희_week1.ipynb b/week1/[MLNovice]김문희_week1.ipynb
diff --git a/week2/[MLNovice]김문희_week2.ipynb b/week2/[MLNovice]김문희_week2.ipynb
diff --git a/week3/[MLNovice]김문희_week3.ipynb b/week3/[MLNovice]김문희_week3.ipynb
diff --git a/week4/[MLNovice]김문희_week4.ipynb b/week4/[MLNovice]김문희_week4.ipynb
diff --git a/week5/[MLNovice]김문희_week5.ipynb b/week5/[MLNovice]김문희_week5.ipynb
diff --git a/week6/[MLNovice]김문희_week6.ipynb b/week6/[MLNovice]김문희_week6.ipynb
diff --git a/week7/[MLNovice]김문희_week7.ipynb b/week7/[MLNovice]김문희_week7.ipynb
diff --git a/week8/[MLNovice]김문희_week8.ipynb b/week8/[MLNovice]김문희_week8.ipynb
diff --git a/week9/[MLNovice]김문희_week9.ipynb b/week9/[MLNovice]김문희_week9.ipynb
@@ -0,0 +1 @@
+{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyM7wYoX7EGWftCHtjZhK9Lg"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"WQVpe78Yfxwc"},"outputs":[],"source":["from google.colab import files\n","\n","# 파일 업로드\n","uploaded = files.upload()  # train.csv, test.csv 파일 선택"]},{"cell_type":"code","source":["import pandas as pd\n","\n","train_data = pd.read_csv('train.csv')\n","test_data = pd.read_csv('test.csv')\n","\n","print(train_data.head())\n","print(test_data.head())"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"hnwQSEcbgMQz","executionInfo":{"status":"ok","timestamp":1733212366970,"user_tz":-540,"elapsed":1206,"user":{"displayName":"김문희","userId":"05132702975260878746"}},"outputId":"7cbfd6b7-5175-47ee-bbe1-9255d079e840"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["   id keyword location                                               text  \\\n","0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   \n","1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   \n","2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   \n","3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   \n","4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   \n","\n","   target  \n","0       1  \n","1       1  \n","2       1  \n","3       1  \n","4       1  \n","   id keyword location                                               text\n","0   0     NaN      NaN                 Just happened a terrible car crash\n","1   2     NaN      NaN  Heard about #earthquake is different cities, s...\n","2   3     NaN      NaN  there is a forest fire at spot pond, geese are...\n","3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires\n","4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan\n"]}]},{"cell_type":"code","source":["import re\n","\n","# 텍스트 전처리 함수\n","def clean_text(text):\n","    text = re.sub(r\"http\\S+\", \"\", text)  # URL 제거\n","    text = re.sub(r\"@\\w+\", \"\", text)    # 멘션 제거\n","    text = re.sub(r\"[^a-zA-Z\\s]\", \"\", text)  # 특수문자 제거\n","    text = text.lower().strip()        # 소문자 변환 및 공백 제거\n","    return text\n","\n","# train 데이터와 test 데이터의 텍스트 전처리\n","train_data['cleaned_text'] = train_data['text'].apply(clean_text)\n","test_data['cleaned_text'] = test_data['text'].apply(clean_text)"],"metadata":{"id":"0rWffUKugeGc","executionInfo":{"status":"ok","timestamp":1733212483396,"user_tz":-540,"elapsed":1169,"user":{"displayName":"김문희","userId":"05132702975260878746"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.model_selection import train_test_split\n","\n","# TfidfVectorizer를 사용하여 텍스트를 수치화\n","vectorizer = TfidfVectorizer(max_features=5000)\n","\n","X = vectorizer.fit_transform(train_data['cleaned_text'])\n","y = train_data['target']\n","\n","# 훈련 데이터와 검증 데이터로 분리\n","X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)\n"],"metadata":{"id":"-YkexoY_g6fw","executionInfo":{"status":"ok","timestamp":1733212509679,"user_tz":-540,"elapsed":7763,"user":{"displayName":"김문희","userId":"05132702975260878746"}}},"execution_count":4,"outputs":[]},{"cell_type":"code","source":["from sklearn.linear_model import LogisticRegression\n","from sklearn.metrics import accuracy_score, classification_report\n","\n","# 모델 훈련\n","model = LogisticRegression()\n","model.fit(X_train, y_train)\n","\n","# 검증 데이터 예측\n","val_predictions = model.predict(X_val)\n","\n","# 정확도 확인\n","print(\"Validation Accuracy:\", accuracy_score(y_val, val_predictions))\n","print(classification_report(y_val, val_predictions))\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"duTVJs2ag_Ub","executionInfo":{"status":"ok","timestamp":1733212516588,"user_tz":-540,"elapsed":1150,"user":{"displayName":"김문희","userId":"05132702975260878746"}},"outputId":"e1356f71-213c-40dd-8c3d-070d700c3510"},"execution_count":5,"outputs":[{"output_type":"stream","name":"stdout","text":["Validation Accuracy: 0.8063033486539725\n","              precision    recall  f1-score   support\n","\n","           0       0.79      0.89      0.84       874\n","           1       0.83      0.69      0.75       649\n","\n","    accuracy                           0.81      1523\n","   macro avg       0.81      0.79      0.80      1523\n","weighted avg       0.81      0.81      0.80      1523\n","\n"]}]},{"cell_type":"code","source":["# 테스트 데이터 변환 및 예측\n","X_test = vectorizer.transform(test_data['cleaned_text'])\n","test_predictions = model.predict(X_test)\n"],"metadata":{"id":"WFBgUjzwhCns","executionInfo":{"status":"ok","timestamp":1733212531762,"user_tz":-540,"elapsed":700,"user":{"displayName":"김문희","userId":"05132702975260878746"}}},"execution_count":6,"outputs":[]},{"cell_type":"code","source":[],"metadata":{"id":"_a46qFCFhGcA"},"execution_count":null,"outputs":[]}]}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyM7wYoX7EGWftCHtjZhK9Lg"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"WQVpe78Yfxwc"},"outputs":[],"source":["from google.colab import files\n","\n","# 파일 업로드\n","uploaded = files.upload() # train.csv, test.csv 파일 선택"]},{"cell_type":"code","source":["import pandas as pd\n","\n","train_data = pd.read_csv('train.csv')\n","test_data = pd.read_csv('test.csv')\n","\n","print(train_data.head())\n","print(test_data.head())"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"hnwQSEcbgMQz","executionInfo":{"status":"ok","timestamp":1733212366970,"user_tz":-540,"elapsed":1206,"user":{"displayName":"김문희","userId":"05132702975260878746"}},"outputId":"7cbfd6b7-5175-47ee-bbe1-9255d079e840"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":[" id keyword location text \\\n","0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n","1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n","2 5 NaN NaN All residents asked to 'shelter in place' are ... \n","3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n","4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n","\n"," target \n","0 1 \n","1 1 \n","2 1 \n","3 1 \n","4 1 \n"," id keyword location text\n","0 0 NaN NaN Just happened a terrible car crash\n","1 2 NaN NaN Heard about #earthquake is different cities, s...\n","2 3 NaN NaN there is a forest fire at spot pond, geese are...\n","3 9 NaN NaN Apocalypse lighting. #Spokane #wildfires\n","4 11 NaN NaN Typhoon Soudelor kills 28 in China and Taiwan\n"]}]},{"cell_type":"code","source":["import re\n","\n","# 텍스트 전처리 함수\n","def clean_text(text):\n"," text = re.sub(r\"http\\S+\", \"\", text) # URL 제거\n"," text = re.sub(r\"@\\w+\", \"\", text) # 멘션 제거\n"," text = re.sub(r\"[^a-zA-Z\\s]\", \"\", text) # 특수문자 제거\n"," text = text.lower().strip() # 소문자 변환 및 공백 제거\n"," return text\n","\n","# train 데이터와 test 데이터의 텍스트 전처리\n","train_data['cleaned_text'] = train_data['text'].apply(clean_text)\n","test_data['cleaned_text'] = test_data['text'].apply(clean_text)"],"metadata":{"id":"0rWffUKugeGc","executionInfo":{"status":"ok","timestamp":1733212483396,"user_tz":-540,"elapsed":1169,"user":{"displayName":"김문희","userId":"05132702975260878746"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.model_selection import train_test_split\n","\n","# TfidfVectorizer를 사용하여 텍스트를 수치화\n","vectorizer = TfidfVectorizer(max_features=5000)\n","\n","X = vectorizer.fit_transform(train_data['cleaned_text'])\n","y = train_data['target']\n","\n","# 훈련 데이터와 검증 데이터로 분리\n","X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)\n"],"metadata":{"id":"-YkexoY_g6fw","executionInfo":{"status":"ok","timestamp":1733212509679,"user_tz":-540,"elapsed":7763,"user":{"displayName":"김문희","userId":"05132702975260878746"}}},"execution_count":4,"outputs":[]},{"cell_type":"code","source":["from sklearn.linear_model import LogisticRegression\n","from sklearn.metrics import accuracy_score, classification_report\n","\n","# 모델 훈련\n","model = LogisticRegression()\n","model.fit(X_train, y_train)\n","\n","# 검증 데이터 예측\n","val_predictions = model.predict(X_val)\n","\n","# 정확도 확인\n","print(\"Validation Accuracy:\", accuracy_score(y_val, val_predictions))\n","print(classification_report(y_val, val_predictions))\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"duTVJs2ag_Ub","executionInfo":{"status":"ok","timestamp":1733212516588,"user_tz":-540,"elapsed":1150,"user":{"displayName":"김문희","userId":"05132702975260878746"}},"outputId":"e1356f71-213c-40dd-8c3d-070d700c3510"},"execution_count":5,"outputs":[{"output_type":"stream","name":"stdout","text":["Validation Accuracy: 0.8063033486539725\n"," precision recall f1-score support\n","\n"," 0 0.79 0.89 0.84 874\n"," 1 0.83 0.69 0.75 649\n","\n"," accuracy 0.81 1523\n"," macro avg 0.81 0.79 0.80 1523\n","weighted avg 0.81 0.81 0.80 1523\n","\n"]}]},{"cell_type":"code","source":["# 테스트 데이터 변환 및 예측\n","X_test = vectorizer.transform(test_data['cleaned_text'])\n","test_predictions = model.predict(X_test)\n"],"metadata":{"id":"WFBgUjzwhCns","executionInfo":{"status":"ok","timestamp":1733212531762,"user_tz":-540,"elapsed":700,"user":{"displayName":"김문희","userId":"05132702975260878746"}}},"execution_count":6,"outputs":[]},{"cell_type":"code","source":[],"metadata":{"id":"_a46qFCFhGcA"},"execution_count":null,"outputs":[]}]}