From ee6a31972b0aac6e3ce4ffda382f6751c3ce92cb Mon Sep 17 00:00:00 2001 From: Maximilian Kany Date: Fri, 15 Aug 2025 11:38:46 +0200 Subject: [PATCH] first version of the plot --- README.md | 18 - comparison.py | 140 ----- games_march2025_cleaned.csv | 3 - games_march2025_cleaned/BernoulliNB.txt | 21 - .../DecisionTreeClassifier.txt | 21 - games_march2025_cleaned/GaussianNB.txt | 21 - .../GradientBoostingClassifier.txt | 21 - games_march2025_cleaned/LinearSVC-i5000.txt | 21 - .../LogisticRegression-i1000.txt | 21 - .../LogisticRegression-i10000.txt | 21 - .../MLPClassifier-i10000.txt | 21 - games_march2025_cleaned/MultinomialNB.txt | 21 - .../RandomForestClassifier.txt | 21 - games_march2025_cleaned/SVC-RBF-i10000.txt | 21 - games_march2025_cleaned_10k.csv | 3 - games_march2025_cleaned_10k/BernoulliNB.txt | 21 - .../DecisionTreeClassifier.txt | 21 - games_march2025_cleaned_10k/GaussianNB.txt | 21 - .../GradientBoostingClassifier.txt | 21 - .../LinearSVC-i5000.txt | 21 - .../LogisticRegression-i1000.txt | 21 - .../LogisticRegression-i10000.txt | 21 - .../MLPClassifier-i10000.txt | 21 - games_march2025_cleaned_10k/MultinomialNB.txt | 21 - .../RandomForestClassifier.txt | 21 - .../SVC-RBF-i10000.txt | 21 - games_march2025_cleaned_2k.csv | 3 - games_march2025_cleaned_2k/BernoulliNB.txt | 21 - .../DecisionTreeClassifier.txt | 21 - games_march2025_cleaned_2k/GaussianNB.txt | 21 - .../GradientBoostingClassifier.txt | 21 - .../LinearSVC-i5000.txt | 21 - .../LogisticRegression-i1000.txt | 21 - .../LogisticRegression-i10000.txt | 21 - .../MLPClassifier-i10000.txt | 21 - games_march2025_cleaned_2k/MultinomialNB.txt | 21 - .../RandomForestClassifier.txt | 21 - games_march2025_cleaned_2k/SVC-RBF-i10000.txt | 21 - notebook.ipynb | 530 ------------------ plot_maker.py | 38 ++ test_script.py | 133 ----- 41 files changed, 38 insertions(+), 1523 deletions(-) delete mode 100644 README.md delete mode 100644 comparison.py delete mode 100644 games_march2025_cleaned.csv delete mode 100644 
games_march2025_cleaned/BernoulliNB.txt delete mode 100644 games_march2025_cleaned/DecisionTreeClassifier.txt delete mode 100644 games_march2025_cleaned/GaussianNB.txt delete mode 100644 games_march2025_cleaned/GradientBoostingClassifier.txt delete mode 100644 games_march2025_cleaned/LinearSVC-i5000.txt delete mode 100644 games_march2025_cleaned/LogisticRegression-i1000.txt delete mode 100644 games_march2025_cleaned/LogisticRegression-i10000.txt delete mode 100644 games_march2025_cleaned/MLPClassifier-i10000.txt delete mode 100644 games_march2025_cleaned/MultinomialNB.txt delete mode 100644 games_march2025_cleaned/RandomForestClassifier.txt delete mode 100644 games_march2025_cleaned/SVC-RBF-i10000.txt delete mode 100644 games_march2025_cleaned_10k.csv delete mode 100644 games_march2025_cleaned_10k/BernoulliNB.txt delete mode 100644 games_march2025_cleaned_10k/DecisionTreeClassifier.txt delete mode 100644 games_march2025_cleaned_10k/GaussianNB.txt delete mode 100644 games_march2025_cleaned_10k/GradientBoostingClassifier.txt delete mode 100644 games_march2025_cleaned_10k/LinearSVC-i5000.txt delete mode 100644 games_march2025_cleaned_10k/LogisticRegression-i1000.txt delete mode 100644 games_march2025_cleaned_10k/LogisticRegression-i10000.txt delete mode 100644 games_march2025_cleaned_10k/MLPClassifier-i10000.txt delete mode 100644 games_march2025_cleaned_10k/MultinomialNB.txt delete mode 100644 games_march2025_cleaned_10k/RandomForestClassifier.txt delete mode 100644 games_march2025_cleaned_10k/SVC-RBF-i10000.txt delete mode 100644 games_march2025_cleaned_2k.csv delete mode 100644 games_march2025_cleaned_2k/BernoulliNB.txt delete mode 100644 games_march2025_cleaned_2k/DecisionTreeClassifier.txt delete mode 100644 games_march2025_cleaned_2k/GaussianNB.txt delete mode 100644 games_march2025_cleaned_2k/GradientBoostingClassifier.txt delete mode 100644 games_march2025_cleaned_2k/LinearSVC-i5000.txt delete mode 100644 games_march2025_cleaned_2k/LogisticRegression-i1000.txt 
delete mode 100644 games_march2025_cleaned_2k/LogisticRegression-i10000.txt delete mode 100644 games_march2025_cleaned_2k/MLPClassifier-i10000.txt delete mode 100644 games_march2025_cleaned_2k/MultinomialNB.txt delete mode 100644 games_march2025_cleaned_2k/RandomForestClassifier.txt delete mode 100644 games_march2025_cleaned_2k/SVC-RBF-i10000.txt delete mode 100644 notebook.ipynb create mode 100644 plot_maker.py delete mode 100644 test_script.py diff --git a/README.md b/README.md deleted file mode 100644 index 4417fc3..0000000 --- a/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Machine Learning Project – Summer Semester 2025 - -This project was created as part of the "Machine Learning" course at HTW Saar in the Practical Computer Science study program. - -## Objective - -We are developing a Jupyter Notebook that automatically predicts the genre of Steam games based on their descriptions. -As a data basis, we use a publicly available Steam Games dataset that we found on Kaggle. - -## Dataset - -We use the [Steam Games Dataset from Kaggle](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data). 
- -## Contributors - -- Maximilian Kany -- Florian Speicher -- Tim Wall \ No newline at end of file diff --git a/comparison.py b/comparison.py deleted file mode 100644 index fcced39..0000000 --- a/comparison.py +++ /dev/null @@ -1,140 +0,0 @@ -import os -import numpy as np -import pandas as pd -from sklearn import set_config - -from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import FunctionTransformer - -from sklearn.preprocessing import MultiLabelBinarizer -import ast - - -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.linear_model import LogisticRegression -from sklearn.multioutput import MultiOutputClassifier -from sklearn.metrics import classification_report -from sklearn.model_selection import train_test_split -from sklearn.datasets import load_iris -from sklearn.metrics import accuracy_score, classification_report -from sklearn.svm import SVC, LinearSVC -from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier -from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB -from sklearn.neighbors import KNeighborsClassifier -from sklearn.neural_network import MLPClassifier - - -set_config(transform_output="pandas") # dataframe supremacy - -def prepDataset(dataset): #returns X_train, X_test, y_train, y_test - dataset = pd.read_csv("./games_march2025_cleaned_2k.csv",sep=",") - # desc, genres, tags - column_transformer = ColumnTransformer([ - # merge all descriptions - ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")), - ['detailed_description', 'about_the_game', 'short_description']), - ('pass', 'passthrough', ['genres']),#, 'tags' - ], - verbose_feature_names_out=False - ) - dataset = column_transformer.fit_transform(dataset) - - - - #### SET MISSING VALUES - print("SETMISS") - # Setting missing numeric values to the mean - dataset.fillna(dataset.mean(numeric_only=True), 
inplace=True) - # Setting missing text values to 'Unknown' - dataset.fillna('', inplace=True) - # Setting missing values in other columns to NaN - dataset.dropna(inplace=True) - - ##### STRUCTURIZE GENRES to onehot - #serialize array - dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s)) - #print(dataset['genres']) # in py but not yet onehotenc - - # MultiLabelBinarizer does onehotenc for arrays - mlb_genres = MultiLabelBinarizer() - genres_encoded = mlb_genres.fit_transform(dataset.pop('genres')) - #genres_count = len(mlb_genres.classes_) # for multi-label classifiction later - - genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_) - #print(genres_df) - #dataset = pd.concat([dataset, genres_df], axis=1) - #print(dataset) - - - #### convert text to bag of words - - ## Count vs Tfidf vectorizer - vectorizer = TfidfVectorizer() - tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix - tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()) - #print(tfidf_df) - - - ##### MODEL - print("MODEL") - - - X = tfidf_df - y = genres_df - # cleanup datapoints that dont have a target value (all target columns are 0) - mask = y.sum(axis=1).map(lambda x: x > 0) - #print((mask == False).sum()) #31 cases with all target columns 0 - X_clean = X[mask] - y_clean = y[mask] - - # Split dataset - return train_test_split(X_clean, y_clean, random_state=0) - -def comparison(X_train, X_test, y_train, y_test, estimator, jobs: int = 1): #returns class_report - multi_target_clf = MultiOutputClassifier(estimator, n_jobs=jobs) # LogisticRegression(max_iter=1337, random_state=0) - - # model training - multi_target_clf.fit(X_train, y_train) - - # predict against test data - y_pred = multi_target_clf.predict(X_test) - return classification_report(y_test, y_pred, zero_division=0.0) - -datasets = [ - 'games_march2025_cleaned_2k.csv', - 'games_march2025_cleaned_10k.csv', - 'games_march2025_cleaned.csv' -] - -estimators 
= { - "LogisticRegression-i1000": LogisticRegression(max_iter=1000, random_state=0), - "LogisticRegression-i10000": LogisticRegression(max_iter=10000, random_state=0), - "LinearSVC-i5000": LinearSVC(max_iter=5000), - "SVC-RBF-i10000": SVC(kernel="rbf", max_iter=10000), - "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0), - "RandomForestClassifier": RandomForestClassifier(random_state=0), - "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0), - "GaussianNB": GaussianNB(), - "MultinomialNB": MultinomialNB(), - "BernoulliNB": BernoulliNB(), - "MLPClassifier-i10000": MLPClassifier(max_iter=10000, random_state=0), -} - -for dataset in datasets: - print("-" * 60) - print("dataset -> " + dataset) - print("-" * 60) - print("mkdir") - folder = dataset.split(".csv")[0] - if not os.path.isdir(folder): - os.mkdir(folder) - X_train, X_test, y_train, y_test = prepDataset(dataset) - for esti in estimators: - compari = comparison(X_train, X_test, y_train, y_test, estimators[esti], 1) #TODO: change the job count if you can - print("open") - f = open(folder + "/" + esti +".txt", mode="w+", encoding="utf-8") - f.write(compari) - print("write") - f.close() - print("close") \ No newline at end of file diff --git a/games_march2025_cleaned.csv b/games_march2025_cleaned.csv deleted file mode 100644 index 4d046d1..0000000 --- a/games_march2025_cleaned.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:04d8df2778aaa8f8b575934b7072d55224d37a2ded23e8261583f0fcf668dfab -size 468641107 diff --git a/games_march2025_cleaned/BernoulliNB.txt b/games_march2025_cleaned/BernoulliNB.txt deleted file mode 100644 index f2237d4..0000000 --- a/games_march2025_cleaned/BernoulliNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.75 0.90 0.82 300 - 1 0.72 0.68 0.70 216 - 2 0.50 0.08 0.14 86 - 3 0.27 0.07 0.11 46 - 4 0.40 0.07 0.12 83 - 5 0.00 0.00 0.00 0 - 6 0.77 0.82 0.79 245 - 7 0.33 0.10 0.15 42 - 8 
0.67 0.40 0.50 127 - 9 0.00 0.00 0.00 12 - 10 0.71 0.37 0.49 127 - 11 0.00 0.00 0.00 14 - 12 0.49 0.31 0.38 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.70 0.55 0.62 1404 - macro avg 0.40 0.27 0.30 1404 -weighted avg 0.64 0.55 0.56 1404 - samples avg 0.73 0.59 0.61 1404 diff --git a/games_march2025_cleaned/DecisionTreeClassifier.txt b/games_march2025_cleaned/DecisionTreeClassifier.txt deleted file mode 100644 index 900c256..0000000 --- a/games_march2025_cleaned/DecisionTreeClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.76 0.73 0.75 300 - 1 0.56 0.53 0.54 216 - 2 0.36 0.33 0.34 86 - 3 0.33 0.26 0.29 46 - 4 0.40 0.46 0.43 83 - 5 0.00 0.00 0.00 0 - 6 0.65 0.61 0.63 245 - 7 0.39 0.40 0.40 42 - 8 0.59 0.57 0.58 127 - 9 0.60 0.25 0.35 12 - 10 0.56 0.51 0.53 127 - 11 0.39 0.50 0.44 14 - 12 0.52 0.49 0.50 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.58 0.55 0.57 1404 - macro avg 0.44 0.40 0.41 1404 -weighted avg 0.58 0.55 0.57 1404 - samples avg 0.59 0.59 0.55 1404 diff --git a/games_march2025_cleaned/GaussianNB.txt b/games_march2025_cleaned/GaussianNB.txt deleted file mode 100644 index 83d7a2e..0000000 --- a/games_march2025_cleaned/GaussianNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.76 0.80 0.78 300 - 1 0.62 0.51 0.56 216 - 2 0.63 0.14 0.23 86 - 3 0.17 0.02 0.04 46 - 4 0.42 0.10 0.16 83 - 5 0.00 0.00 0.00 0 - 6 0.68 0.66 0.67 245 - 7 0.56 0.12 0.20 42 - 8 0.55 0.33 0.41 127 - 9 0.67 0.17 0.27 12 - 10 0.65 0.31 0.42 127 - 11 1.00 0.14 0.25 14 - 12 0.53 0.29 0.38 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.66 0.47 0.55 1404 - macro avg 0.52 0.26 0.31 1404 -weighted avg 0.62 0.47 0.51 1404 - samples avg 0.67 0.53 0.55 1404 diff --git a/games_march2025_cleaned/GradientBoostingClassifier.txt b/games_march2025_cleaned/GradientBoostingClassifier.txt deleted file mode 100644 index 7c8ce6e..0000000 --- a/games_march2025_cleaned/GradientBoostingClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall 
f1-score support - - 0 0.85 0.80 0.83 300 - 1 0.77 0.61 0.68 216 - 2 0.55 0.13 0.21 86 - 3 0.42 0.11 0.17 46 - 4 0.68 0.33 0.44 83 - 5 0.00 0.00 0.00 0 - 6 0.71 0.76 0.74 245 - 7 0.61 0.26 0.37 42 - 8 0.81 0.50 0.61 127 - 9 0.75 0.25 0.38 12 - 10 0.81 0.54 0.65 127 - 11 0.40 0.43 0.41 14 - 12 0.69 0.42 0.53 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.76 0.57 0.65 1404 - macro avg 0.57 0.37 0.43 1404 -weighted avg 0.74 0.57 0.63 1404 - samples avg 0.76 0.63 0.65 1404 diff --git a/games_march2025_cleaned/LinearSVC-i5000.txt b/games_march2025_cleaned/LinearSVC-i5000.txt deleted file mode 100644 index df82b40..0000000 --- a/games_march2025_cleaned/LinearSVC-i5000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.85 0.87 0.86 300 - 1 0.76 0.66 0.70 216 - 2 0.77 0.20 0.31 86 - 3 0.00 0.00 0.00 46 - 4 0.76 0.27 0.39 83 - 5 0.00 0.00 0.00 0 - 6 0.78 0.81 0.79 245 - 7 0.89 0.19 0.31 42 - 8 0.77 0.60 0.67 127 - 9 1.00 0.58 0.74 12 - 10 0.85 0.54 0.66 127 - 11 1.00 0.29 0.44 14 - 12 0.82 0.42 0.56 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.80 0.61 0.69 1404 - macro avg 0.66 0.39 0.46 1404 -weighted avg 0.78 0.61 0.66 1404 - samples avg 0.81 0.67 0.69 1404 diff --git a/games_march2025_cleaned/LogisticRegression-i1000.txt b/games_march2025_cleaned/LogisticRegression-i1000.txt deleted file mode 100644 index b7926d4..0000000 --- a/games_march2025_cleaned/LogisticRegression-i1000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.78 0.91 0.84 300 - 1 0.78 0.62 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.04 0.07 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.81 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.90 0.34 0.49 127 - 9 0.00 0.00 0.00 12 - 10 0.89 0.25 0.39 127 - 11 0.00 0.00 0.00 14 - 12 0.88 0.14 0.24 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.50 0.61 1404 - macro avg 0.50 0.22 0.26 1404 -weighted avg 0.77 0.50 0.53 1404 - samples avg 0.77 0.56 0.60 1404 diff --git 
a/games_march2025_cleaned/LogisticRegression-i10000.txt b/games_march2025_cleaned/LogisticRegression-i10000.txt deleted file mode 100644 index b7926d4..0000000 --- a/games_march2025_cleaned/LogisticRegression-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.78 0.91 0.84 300 - 1 0.78 0.62 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.04 0.07 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.81 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.90 0.34 0.49 127 - 9 0.00 0.00 0.00 12 - 10 0.89 0.25 0.39 127 - 11 0.00 0.00 0.00 14 - 12 0.88 0.14 0.24 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.50 0.61 1404 - macro avg 0.50 0.22 0.26 1404 -weighted avg 0.77 0.50 0.53 1404 - samples avg 0.77 0.56 0.60 1404 diff --git a/games_march2025_cleaned/MLPClassifier-i10000.txt b/games_march2025_cleaned/MLPClassifier-i10000.txt deleted file mode 100644 index c4634dc..0000000 --- a/games_march2025_cleaned/MLPClassifier-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.84 0.85 0.84 300 - 1 0.73 0.67 0.70 216 - 2 0.74 0.30 0.43 86 - 3 0.50 0.02 0.04 46 - 4 0.69 0.24 0.36 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.79 0.79 245 - 7 0.86 0.14 0.24 42 - 8 0.76 0.63 0.69 127 - 9 1.00 0.33 0.50 12 - 10 0.81 0.52 0.63 127 - 11 1.00 0.14 0.25 14 - 12 0.75 0.41 0.53 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.60 0.68 1404 - macro avg 0.68 0.36 0.43 1404 -weighted avg 0.78 0.60 0.65 1404 - samples avg 0.80 0.66 0.68 1404 diff --git a/games_march2025_cleaned/MultinomialNB.txt b/games_march2025_cleaned/MultinomialNB.txt deleted file mode 100644 index bc74cf3..0000000 --- a/games_march2025_cleaned/MultinomialNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.64 0.99 0.78 300 - 1 0.85 0.24 0.37 216 - 2 0.60 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 0.80 0.05 0.09 83 - 5 0.00 0.00 0.00 0 - 6 0.78 0.80 0.79 245 - 7 0.40 0.05 0.09 42 - 8 1.00 0.04 0.08 127 - 9 0.00 0.00 0.00 12 - 10 0.20 0.01 0.02 127 - 11 0.00 0.00 
0.00 14 - 12 1.00 0.05 0.09 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.69 0.40 0.51 1404 - macro avg 0.45 0.16 0.17 1404 -weighted avg 0.68 0.40 0.39 1404 - samples avg 0.70 0.44 0.50 1404 diff --git a/games_march2025_cleaned/RandomForestClassifier.txt b/games_march2025_cleaned/RandomForestClassifier.txt deleted file mode 100644 index 6fbe546..0000000 --- a/games_march2025_cleaned/RandomForestClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.80 0.88 0.84 300 - 1 0.78 0.55 0.64 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.06 0.11 83 - 5 0.00 0.00 0.00 0 - 6 0.74 0.78 0.76 245 - 7 0.00 0.00 0.00 42 - 8 0.84 0.24 0.38 127 - 9 0.00 0.00 0.00 12 - 10 0.91 0.24 0.38 127 - 11 1.00 0.14 0.25 14 - 12 1.00 0.25 0.39 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.48 0.59 1404 - macro avg 0.58 0.23 0.27 1404 -weighted avg 0.78 0.48 0.52 1404 - samples avg 0.77 0.54 0.60 1404 diff --git a/games_march2025_cleaned/SVC-RBF-i10000.txt b/games_march2025_cleaned/SVC-RBF-i10000.txt deleted file mode 100644 index ff0c7b7..0000000 --- a/games_march2025_cleaned/SVC-RBF-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.81 0.90 0.85 300 - 1 0.76 0.63 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.05 0.09 83 - 5 0.00 0.00 0.00 0 - 6 0.77 0.83 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.84 0.40 0.54 127 - 9 1.00 0.17 0.29 12 - 10 0.90 0.34 0.49 127 - 11 1.00 0.14 0.25 14 - 12 0.92 0.21 0.34 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.80 0.53 0.63 1404 - macro avg 0.64 0.26 0.32 1404 -weighted avg 0.79 0.53 0.56 1404 - samples avg 0.79 0.59 0.63 1404 diff --git a/games_march2025_cleaned_10k.csv b/games_march2025_cleaned_10k.csv deleted file mode 100644 index 2c3c073..0000000 --- a/games_march2025_cleaned_10k.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12cf598a6e41d83cfa9c16e99d4d9578cb4ee7c3594fae9f9b921772887a08d7 -size 68658136 diff --git 
a/games_march2025_cleaned_10k/BernoulliNB.txt b/games_march2025_cleaned_10k/BernoulliNB.txt deleted file mode 100644 index f2237d4..0000000 --- a/games_march2025_cleaned_10k/BernoulliNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.75 0.90 0.82 300 - 1 0.72 0.68 0.70 216 - 2 0.50 0.08 0.14 86 - 3 0.27 0.07 0.11 46 - 4 0.40 0.07 0.12 83 - 5 0.00 0.00 0.00 0 - 6 0.77 0.82 0.79 245 - 7 0.33 0.10 0.15 42 - 8 0.67 0.40 0.50 127 - 9 0.00 0.00 0.00 12 - 10 0.71 0.37 0.49 127 - 11 0.00 0.00 0.00 14 - 12 0.49 0.31 0.38 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.70 0.55 0.62 1404 - macro avg 0.40 0.27 0.30 1404 -weighted avg 0.64 0.55 0.56 1404 - samples avg 0.73 0.59 0.61 1404 diff --git a/games_march2025_cleaned_10k/DecisionTreeClassifier.txt b/games_march2025_cleaned_10k/DecisionTreeClassifier.txt deleted file mode 100644 index 900c256..0000000 --- a/games_march2025_cleaned_10k/DecisionTreeClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.76 0.73 0.75 300 - 1 0.56 0.53 0.54 216 - 2 0.36 0.33 0.34 86 - 3 0.33 0.26 0.29 46 - 4 0.40 0.46 0.43 83 - 5 0.00 0.00 0.00 0 - 6 0.65 0.61 0.63 245 - 7 0.39 0.40 0.40 42 - 8 0.59 0.57 0.58 127 - 9 0.60 0.25 0.35 12 - 10 0.56 0.51 0.53 127 - 11 0.39 0.50 0.44 14 - 12 0.52 0.49 0.50 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.58 0.55 0.57 1404 - macro avg 0.44 0.40 0.41 1404 -weighted avg 0.58 0.55 0.57 1404 - samples avg 0.59 0.59 0.55 1404 diff --git a/games_march2025_cleaned_10k/GaussianNB.txt b/games_march2025_cleaned_10k/GaussianNB.txt deleted file mode 100644 index 83d7a2e..0000000 --- a/games_march2025_cleaned_10k/GaussianNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.76 0.80 0.78 300 - 1 0.62 0.51 0.56 216 - 2 0.63 0.14 0.23 86 - 3 0.17 0.02 0.04 46 - 4 0.42 0.10 0.16 83 - 5 0.00 0.00 0.00 0 - 6 0.68 0.66 0.67 245 - 7 0.56 0.12 0.20 42 - 8 0.55 0.33 0.41 127 - 9 0.67 0.17 0.27 12 - 10 0.65 0.31 0.42 127 - 11 1.00 0.14 0.25 14 
- 12 0.53 0.29 0.38 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.66 0.47 0.55 1404 - macro avg 0.52 0.26 0.31 1404 -weighted avg 0.62 0.47 0.51 1404 - samples avg 0.67 0.53 0.55 1404 diff --git a/games_march2025_cleaned_10k/GradientBoostingClassifier.txt b/games_march2025_cleaned_10k/GradientBoostingClassifier.txt deleted file mode 100644 index 7c8ce6e..0000000 --- a/games_march2025_cleaned_10k/GradientBoostingClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.85 0.80 0.83 300 - 1 0.77 0.61 0.68 216 - 2 0.55 0.13 0.21 86 - 3 0.42 0.11 0.17 46 - 4 0.68 0.33 0.44 83 - 5 0.00 0.00 0.00 0 - 6 0.71 0.76 0.74 245 - 7 0.61 0.26 0.37 42 - 8 0.81 0.50 0.61 127 - 9 0.75 0.25 0.38 12 - 10 0.81 0.54 0.65 127 - 11 0.40 0.43 0.41 14 - 12 0.69 0.42 0.53 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.76 0.57 0.65 1404 - macro avg 0.57 0.37 0.43 1404 -weighted avg 0.74 0.57 0.63 1404 - samples avg 0.76 0.63 0.65 1404 diff --git a/games_march2025_cleaned_10k/LinearSVC-i5000.txt b/games_march2025_cleaned_10k/LinearSVC-i5000.txt deleted file mode 100644 index df82b40..0000000 --- a/games_march2025_cleaned_10k/LinearSVC-i5000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.85 0.87 0.86 300 - 1 0.76 0.66 0.70 216 - 2 0.77 0.20 0.31 86 - 3 0.00 0.00 0.00 46 - 4 0.76 0.27 0.39 83 - 5 0.00 0.00 0.00 0 - 6 0.78 0.81 0.79 245 - 7 0.89 0.19 0.31 42 - 8 0.77 0.60 0.67 127 - 9 1.00 0.58 0.74 12 - 10 0.85 0.54 0.66 127 - 11 1.00 0.29 0.44 14 - 12 0.82 0.42 0.56 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.80 0.61 0.69 1404 - macro avg 0.66 0.39 0.46 1404 -weighted avg 0.78 0.61 0.66 1404 - samples avg 0.81 0.67 0.69 1404 diff --git a/games_march2025_cleaned_10k/LogisticRegression-i1000.txt b/games_march2025_cleaned_10k/LogisticRegression-i1000.txt deleted file mode 100644 index b7926d4..0000000 --- a/games_march2025_cleaned_10k/LogisticRegression-i1000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.78 0.91 
0.84 300 - 1 0.78 0.62 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.04 0.07 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.81 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.90 0.34 0.49 127 - 9 0.00 0.00 0.00 12 - 10 0.89 0.25 0.39 127 - 11 0.00 0.00 0.00 14 - 12 0.88 0.14 0.24 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.50 0.61 1404 - macro avg 0.50 0.22 0.26 1404 -weighted avg 0.77 0.50 0.53 1404 - samples avg 0.77 0.56 0.60 1404 diff --git a/games_march2025_cleaned_10k/LogisticRegression-i10000.txt b/games_march2025_cleaned_10k/LogisticRegression-i10000.txt deleted file mode 100644 index b7926d4..0000000 --- a/games_march2025_cleaned_10k/LogisticRegression-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.78 0.91 0.84 300 - 1 0.78 0.62 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.04 0.07 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.81 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.90 0.34 0.49 127 - 9 0.00 0.00 0.00 12 - 10 0.89 0.25 0.39 127 - 11 0.00 0.00 0.00 14 - 12 0.88 0.14 0.24 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.50 0.61 1404 - macro avg 0.50 0.22 0.26 1404 -weighted avg 0.77 0.50 0.53 1404 - samples avg 0.77 0.56 0.60 1404 diff --git a/games_march2025_cleaned_10k/MLPClassifier-i10000.txt b/games_march2025_cleaned_10k/MLPClassifier-i10000.txt deleted file mode 100644 index c4634dc..0000000 --- a/games_march2025_cleaned_10k/MLPClassifier-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.84 0.85 0.84 300 - 1 0.73 0.67 0.70 216 - 2 0.74 0.30 0.43 86 - 3 0.50 0.02 0.04 46 - 4 0.69 0.24 0.36 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.79 0.79 245 - 7 0.86 0.14 0.24 42 - 8 0.76 0.63 0.69 127 - 9 1.00 0.33 0.50 12 - 10 0.81 0.52 0.63 127 - 11 1.00 0.14 0.25 14 - 12 0.75 0.41 0.53 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.60 0.68 1404 - macro avg 0.68 0.36 0.43 1404 -weighted avg 0.78 0.60 0.65 1404 - samples avg 0.80 0.66 0.68 1404 diff --git 
a/games_march2025_cleaned_10k/MultinomialNB.txt b/games_march2025_cleaned_10k/MultinomialNB.txt deleted file mode 100644 index bc74cf3..0000000 --- a/games_march2025_cleaned_10k/MultinomialNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.64 0.99 0.78 300 - 1 0.85 0.24 0.37 216 - 2 0.60 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 0.80 0.05 0.09 83 - 5 0.00 0.00 0.00 0 - 6 0.78 0.80 0.79 245 - 7 0.40 0.05 0.09 42 - 8 1.00 0.04 0.08 127 - 9 0.00 0.00 0.00 12 - 10 0.20 0.01 0.02 127 - 11 0.00 0.00 0.00 14 - 12 1.00 0.05 0.09 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.69 0.40 0.51 1404 - macro avg 0.45 0.16 0.17 1404 -weighted avg 0.68 0.40 0.39 1404 - samples avg 0.70 0.44 0.50 1404 diff --git a/games_march2025_cleaned_10k/RandomForestClassifier.txt b/games_march2025_cleaned_10k/RandomForestClassifier.txt deleted file mode 100644 index 6fbe546..0000000 --- a/games_march2025_cleaned_10k/RandomForestClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.80 0.88 0.84 300 - 1 0.78 0.55 0.64 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.06 0.11 83 - 5 0.00 0.00 0.00 0 - 6 0.74 0.78 0.76 245 - 7 0.00 0.00 0.00 42 - 8 0.84 0.24 0.38 127 - 9 0.00 0.00 0.00 12 - 10 0.91 0.24 0.38 127 - 11 1.00 0.14 0.25 14 - 12 1.00 0.25 0.39 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.48 0.59 1404 - macro avg 0.58 0.23 0.27 1404 -weighted avg 0.78 0.48 0.52 1404 - samples avg 0.77 0.54 0.60 1404 diff --git a/games_march2025_cleaned_10k/SVC-RBF-i10000.txt b/games_march2025_cleaned_10k/SVC-RBF-i10000.txt deleted file mode 100644 index ff0c7b7..0000000 --- a/games_march2025_cleaned_10k/SVC-RBF-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.81 0.90 0.85 300 - 1 0.76 0.63 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.05 0.09 83 - 5 0.00 0.00 0.00 0 - 6 0.77 0.83 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.84 0.40 0.54 127 - 9 1.00 0.17 0.29 12 - 10 0.90 0.34 0.49 127 - 11 
1.00 0.14 0.25 14 - 12 0.92 0.21 0.34 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.80 0.53 0.63 1404 - macro avg 0.64 0.26 0.32 1404 -weighted avg 0.79 0.53 0.56 1404 - samples avg 0.79 0.59 0.63 1404 diff --git a/games_march2025_cleaned_2k.csv b/games_march2025_cleaned_2k.csv deleted file mode 100644 index 806e982..0000000 --- a/games_march2025_cleaned_2k.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75ba38404995149bcb8e5a321459f73b4adf58597f85bab396dd054cc78c145d -size 15455174 diff --git a/games_march2025_cleaned_2k/BernoulliNB.txt b/games_march2025_cleaned_2k/BernoulliNB.txt deleted file mode 100644 index f2237d4..0000000 --- a/games_march2025_cleaned_2k/BernoulliNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.75 0.90 0.82 300 - 1 0.72 0.68 0.70 216 - 2 0.50 0.08 0.14 86 - 3 0.27 0.07 0.11 46 - 4 0.40 0.07 0.12 83 - 5 0.00 0.00 0.00 0 - 6 0.77 0.82 0.79 245 - 7 0.33 0.10 0.15 42 - 8 0.67 0.40 0.50 127 - 9 0.00 0.00 0.00 12 - 10 0.71 0.37 0.49 127 - 11 0.00 0.00 0.00 14 - 12 0.49 0.31 0.38 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.70 0.55 0.62 1404 - macro avg 0.40 0.27 0.30 1404 -weighted avg 0.64 0.55 0.56 1404 - samples avg 0.73 0.59 0.61 1404 diff --git a/games_march2025_cleaned_2k/DecisionTreeClassifier.txt b/games_march2025_cleaned_2k/DecisionTreeClassifier.txt deleted file mode 100644 index 900c256..0000000 --- a/games_march2025_cleaned_2k/DecisionTreeClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.76 0.73 0.75 300 - 1 0.56 0.53 0.54 216 - 2 0.36 0.33 0.34 86 - 3 0.33 0.26 0.29 46 - 4 0.40 0.46 0.43 83 - 5 0.00 0.00 0.00 0 - 6 0.65 0.61 0.63 245 - 7 0.39 0.40 0.40 42 - 8 0.59 0.57 0.58 127 - 9 0.60 0.25 0.35 12 - 10 0.56 0.51 0.53 127 - 11 0.39 0.50 0.44 14 - 12 0.52 0.49 0.50 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.58 0.55 0.57 1404 - macro avg 0.44 0.40 0.41 1404 -weighted avg 0.58 0.55 0.57 1404 - samples avg 0.59 0.59 0.55 1404 diff 
--git a/games_march2025_cleaned_2k/GaussianNB.txt b/games_march2025_cleaned_2k/GaussianNB.txt deleted file mode 100644 index 83d7a2e..0000000 --- a/games_march2025_cleaned_2k/GaussianNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.76 0.80 0.78 300 - 1 0.62 0.51 0.56 216 - 2 0.63 0.14 0.23 86 - 3 0.17 0.02 0.04 46 - 4 0.42 0.10 0.16 83 - 5 0.00 0.00 0.00 0 - 6 0.68 0.66 0.67 245 - 7 0.56 0.12 0.20 42 - 8 0.55 0.33 0.41 127 - 9 0.67 0.17 0.27 12 - 10 0.65 0.31 0.42 127 - 11 1.00 0.14 0.25 14 - 12 0.53 0.29 0.38 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.66 0.47 0.55 1404 - macro avg 0.52 0.26 0.31 1404 -weighted avg 0.62 0.47 0.51 1404 - samples avg 0.67 0.53 0.55 1404 diff --git a/games_march2025_cleaned_2k/GradientBoostingClassifier.txt b/games_march2025_cleaned_2k/GradientBoostingClassifier.txt deleted file mode 100644 index 7c8ce6e..0000000 --- a/games_march2025_cleaned_2k/GradientBoostingClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.85 0.80 0.83 300 - 1 0.77 0.61 0.68 216 - 2 0.55 0.13 0.21 86 - 3 0.42 0.11 0.17 46 - 4 0.68 0.33 0.44 83 - 5 0.00 0.00 0.00 0 - 6 0.71 0.76 0.74 245 - 7 0.61 0.26 0.37 42 - 8 0.81 0.50 0.61 127 - 9 0.75 0.25 0.38 12 - 10 0.81 0.54 0.65 127 - 11 0.40 0.43 0.41 14 - 12 0.69 0.42 0.53 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.76 0.57 0.65 1404 - macro avg 0.57 0.37 0.43 1404 -weighted avg 0.74 0.57 0.63 1404 - samples avg 0.76 0.63 0.65 1404 diff --git a/games_march2025_cleaned_2k/LinearSVC-i5000.txt b/games_march2025_cleaned_2k/LinearSVC-i5000.txt deleted file mode 100644 index df82b40..0000000 --- a/games_march2025_cleaned_2k/LinearSVC-i5000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.85 0.87 0.86 300 - 1 0.76 0.66 0.70 216 - 2 0.77 0.20 0.31 86 - 3 0.00 0.00 0.00 46 - 4 0.76 0.27 0.39 83 - 5 0.00 0.00 0.00 0 - 6 0.78 0.81 0.79 245 - 7 0.89 0.19 0.31 42 - 8 0.77 0.60 0.67 127 - 9 1.00 0.58 0.74 12 - 10 0.85 0.54 0.66 127 - 
11 1.00 0.29 0.44 14 - 12 0.82 0.42 0.56 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.80 0.61 0.69 1404 - macro avg 0.66 0.39 0.46 1404 -weighted avg 0.78 0.61 0.66 1404 - samples avg 0.81 0.67 0.69 1404 diff --git a/games_march2025_cleaned_2k/LogisticRegression-i1000.txt b/games_march2025_cleaned_2k/LogisticRegression-i1000.txt deleted file mode 100644 index b7926d4..0000000 --- a/games_march2025_cleaned_2k/LogisticRegression-i1000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.78 0.91 0.84 300 - 1 0.78 0.62 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.04 0.07 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.81 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.90 0.34 0.49 127 - 9 0.00 0.00 0.00 12 - 10 0.89 0.25 0.39 127 - 11 0.00 0.00 0.00 14 - 12 0.88 0.14 0.24 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.50 0.61 1404 - macro avg 0.50 0.22 0.26 1404 -weighted avg 0.77 0.50 0.53 1404 - samples avg 0.77 0.56 0.60 1404 diff --git a/games_march2025_cleaned_2k/LogisticRegression-i10000.txt b/games_march2025_cleaned_2k/LogisticRegression-i10000.txt deleted file mode 100644 index b7926d4..0000000 --- a/games_march2025_cleaned_2k/LogisticRegression-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.78 0.91 0.84 300 - 1 0.78 0.62 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.04 0.07 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.81 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.90 0.34 0.49 127 - 9 0.00 0.00 0.00 12 - 10 0.89 0.25 0.39 127 - 11 0.00 0.00 0.00 14 - 12 0.88 0.14 0.24 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.50 0.61 1404 - macro avg 0.50 0.22 0.26 1404 -weighted avg 0.77 0.50 0.53 1404 - samples avg 0.77 0.56 0.60 1404 diff --git a/games_march2025_cleaned_2k/MLPClassifier-i10000.txt b/games_march2025_cleaned_2k/MLPClassifier-i10000.txt deleted file mode 100644 index c4634dc..0000000 --- a/games_march2025_cleaned_2k/MLPClassifier-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score 
support - - 0 0.84 0.85 0.84 300 - 1 0.73 0.67 0.70 216 - 2 0.74 0.30 0.43 86 - 3 0.50 0.02 0.04 46 - 4 0.69 0.24 0.36 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.79 0.79 245 - 7 0.86 0.14 0.24 42 - 8 0.76 0.63 0.69 127 - 9 1.00 0.33 0.50 12 - 10 0.81 0.52 0.63 127 - 11 1.00 0.14 0.25 14 - 12 0.75 0.41 0.53 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.60 0.68 1404 - macro avg 0.68 0.36 0.43 1404 -weighted avg 0.78 0.60 0.65 1404 - samples avg 0.80 0.66 0.68 1404 diff --git a/games_march2025_cleaned_2k/MultinomialNB.txt b/games_march2025_cleaned_2k/MultinomialNB.txt deleted file mode 100644 index bc74cf3..0000000 --- a/games_march2025_cleaned_2k/MultinomialNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.64 0.99 0.78 300 - 1 0.85 0.24 0.37 216 - 2 0.60 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 0.80 0.05 0.09 83 - 5 0.00 0.00 0.00 0 - 6 0.78 0.80 0.79 245 - 7 0.40 0.05 0.09 42 - 8 1.00 0.04 0.08 127 - 9 0.00 0.00 0.00 12 - 10 0.20 0.01 0.02 127 - 11 0.00 0.00 0.00 14 - 12 1.00 0.05 0.09 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.69 0.40 0.51 1404 - macro avg 0.45 0.16 0.17 1404 -weighted avg 0.68 0.40 0.39 1404 - samples avg 0.70 0.44 0.50 1404 diff --git a/games_march2025_cleaned_2k/RandomForestClassifier.txt b/games_march2025_cleaned_2k/RandomForestClassifier.txt deleted file mode 100644 index 6fbe546..0000000 --- a/games_march2025_cleaned_2k/RandomForestClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.80 0.88 0.84 300 - 1 0.78 0.55 0.64 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.06 0.11 83 - 5 0.00 0.00 0.00 0 - 6 0.74 0.78 0.76 245 - 7 0.00 0.00 0.00 42 - 8 0.84 0.24 0.38 127 - 9 0.00 0.00 0.00 12 - 10 0.91 0.24 0.38 127 - 11 1.00 0.14 0.25 14 - 12 1.00 0.25 0.39 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.48 0.59 1404 - macro avg 0.58 0.23 0.27 1404 -weighted avg 0.78 0.48 0.52 1404 - samples avg 0.77 0.54 0.60 1404 diff --git a/games_march2025_cleaned_2k/SVC-RBF-i10000.txt 
b/games_march2025_cleaned_2k/SVC-RBF-i10000.txt deleted file mode 100644 index ff0c7b7..0000000 --- a/games_march2025_cleaned_2k/SVC-RBF-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.81 0.90 0.85 300 - 1 0.76 0.63 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.05 0.09 83 - 5 0.00 0.00 0.00 0 - 6 0.77 0.83 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.84 0.40 0.54 127 - 9 1.00 0.17 0.29 12 - 10 0.90 0.34 0.49 127 - 11 1.00 0.14 0.25 14 - 12 0.92 0.21 0.34 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.80 0.53 0.63 1404 - macro avg 0.64 0.26 0.32 1404 -weighted avg 0.79 0.53 0.56 1404 - samples avg 0.79 0.59 0.63 1404 diff --git a/notebook.ipynb b/notebook.ipynb deleted file mode 100644 index 3307ceb..0000000 --- a/notebook.ipynb +++ /dev/null @@ -1,530 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a3a7634f", - "metadata": {}, - "source": [ - "# Machine Learning project in SoSe 2025 at HTW Saar\n", - "## Idea\n", - "The goal of this project is predicting the genre(s) of a game/bundle through its given description(s)\n", - "\n", - "## Dataset\n", - "For our project we use a Steam Dataset provided on moodle, since it has all information we plan on using.\n", - "The Dataset has been cut to only 2000 data points to be runnable on weaker devices." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3116b75f", - "metadata": { - "jupyter": { - "is_executing": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " appid name release_date required_age price dlc_count \\\n", - "0 730 Counter-Strike 2 2012-08-21 0 0.0 1 \n", - "\n", - " detailed_description \\\n", - "0 For over two decades, Counter-Strike has offer... \n", - "\n", - " about_the_game \\\n", - "0 For over two decades, Counter-Strike has offer... \n", - "\n", - " short_description reviews ... \\\n", - "0 For over two decades, Counter-Strike has offer... NaN ... 
\n", - "\n", - " average_playtime_2weeks median_playtime_forever median_playtime_2weeks \\\n", - "0 879 5174 350 \n", - "\n", - " discount peak_ccu tags \\\n", - "0 0 1212356 {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'... \n", - "\n", - " pct_pos_total num_reviews_total pct_pos_recent num_reviews_recent \n", - "0 86 8632939 82 96473 \n", - "\n", - "[1 rows x 47 columns]\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn import set_config\n", - "\n", - "set_config(transform_output=\"pandas\")\n", - "\n", - "dataset = pd.read_csv(\"./games_march2025_cleaned_2k.csv\",sep=\",\")\n", - "print(dataset.head(1))" - ] - }, - { - "cell_type": "markdown", - "id": "cba9750a", - "metadata": {}, - "source": [ - "## Preparation of the Dataset\n", - "### Removing Uniques\n", - "We would remove the following features from the Training-Set as they can/could uniquely identify a datapoint, but we don't as they will be removed in the next step anyway\n", - "- AppId\n", - "- Name of the Game\n", - "- Realease Date\n", - "- Reviews\n", - "- Header Image\n", - "- Website\n", - "- Support URL\n", - "- Support Email\n", - "- MetaCritic URL\n", - "- Developer\n", - "- Publisher\n", - "- Screenshots\n", - "- Movies\n", - "- Estimated Owners" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d159117377f3633c", - "metadata": {}, - "outputs": [], - "source": [ - "#dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email', 'metacritic_url', 'notes', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'], axis=1, inplace=True)\n", - "#print(dataset.head())" - ] - }, - { - "cell_type": "markdown", - "id": "e1b28ddd69f1e9a6", - "metadata": {}, - "source": [ - "## Hold onto necessary information\n", - "Our model should turn a textual description of a game into its genre. 
For that we need all the textual information a game has, as well as the genres of the game.\n", - "We use a ColumnTransformer to drop all unnecessary lines, merge all descriptions of a game into one big description and hold onto the genres\n", - "\n", - "It is important to use ``verbose_feature_names_out=False`` so the feature names don't get changed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "986fbb31a7ae0d8b", - "metadata": { - "jupyter": { - "is_executing": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " desc \\\n", - "0 For over two decades, Counter-Strike has offer... \n", - "1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n", - "2 The most-played game on Steam. Every day, mill... \n", - "3 When a young street hustler, a retired bank ro... \n", - "4 Edition Comparison Ultimate Edition The Tom Cl... \n", - "\n", - " genres \n", - "0 ['Action', 'Free To Play'] \n", - "1 ['Action', 'Adventure', 'Massively Multiplayer... \n", - "2 ['Action', 'Strategy', 'Free To Play'] \n", - "3 ['Action', 'Adventure'] \n", - "4 ['Action'] \n" - ] - } - ], - "source": [ - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.preprocessing import FunctionTransformer\n", - "\n", - "# desc, genres\n", - "column_transformer = ColumnTransformer([\n", - " # merge all descriptions\n", - " ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name=\"desc\")),\n", - " ['detailed_description', 'about_the_game', 'short_description']),\n", - " ('pass', 'passthrough', ['genres']),\n", - " ],\n", - " verbose_feature_names_out=False\n", - ")\n", - "dataset = column_transformer.fit_transform(dataset)\n", - "print(dataset.head())" - ] - }, - { - "cell_type": "markdown", - "id": "f9b89c0645811564", - "metadata": {}, - "source": [ - "### Adding missing Information\n", - "Some Games might not have any descriptions. 
For these we Input an Empty String\n", - "**TODO: check if dropna and fillna numeric_only is needed, as we dont have any numbers**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44239f6b7fd23cde", - "metadata": {}, - "outputs": [], - "source": [ - "# missing numeric values => mean\n", - "dataset.fillna(dataset.mean(numeric_only=True), inplace=True)\n", - "# missing strings => empty string?\n", - "dataset.fillna('', inplace=True)\n", - "# drop all lines with missing values\n", - "dataset.dropna(inplace=True)" - ] - }, - { - "cell_type": "markdown", - "id": "ca5b59b9fa8160a0", - "metadata": {}, - "source": [ - "## Transform Genres\n", - "The genre information currently is a string holding a python array of genres. While this is machine-readable, we need One-Hot-Encoding for our model to work.\n", - "\n", - "#### Serializing the String-Array\n", - "The \"ast\" library can interpret python strings as python code, and as such will be used for serializing the genres." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ebc5a24e9bc87fdd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 [Action, Free To Play]\n", - "1 [Action, Adventure, Massively Multiplayer, Fre...\n", - "2 [Action, Strategy, Free To Play]\n", - "3 [Action, Adventure]\n", - "4 [Action]\n", - "Name: genres, dtype: object\n" - ] - } - ], - "source": [ - "import ast\n", - "\n", - "dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s))\n", - "print(dataset['genres'].head())" - ] - }, - { - "cell_type": "markdown", - "id": "f90756f9ad9211f4", - "metadata": {}, - "source": [ - "#### One-Hot-Encoding an Python-Array\n", - "The sklearn ``OneHotEncoder()`` is only able to work with an 1D Array of different classes, such as ``['Politics', 'Sport', 'Culture']``. Every datapoint can only have one concurrent classification.\n", - "Steam allows an app/bundle to have multiple genres. 
As such, our dataset has an 2D Array of different classes, which sklearn's ``MultiLabelBinarizer()`` does support." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d2c3527a5fc876bf", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Action Adventure Casual Early Access Free To Play Gore Indie \\\n", - "0 1 0 0 0 1 0 0 \n", - "1 1 1 0 0 1 0 0 \n", - "2 1 0 0 0 1 0 0 \n", - "3 1 1 0 0 0 0 0 \n", - "4 1 0 0 0 0 0 0 \n", - "\n", - " Massively Multiplayer RPG Racing Simulation Sports Strategy Violent \n", - "0 0 0 0 0 0 0 0 \n", - "1 1 0 0 0 0 0 0 \n", - "2 0 0 0 0 0 1 0 \n", - "3 0 0 0 0 0 0 0 \n", - "4 0 0 0 0 0 0 0 \n" - ] - } - ], - "source": [ - "from sklearn.preprocessing import MultiLabelBinarizer\n", - "\n", - "mlb_genres = MultiLabelBinarizer()\n", - "genres_encoded = mlb_genres.fit_transform(dataset.pop('genres'))\n", - "genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)\n", - "print(genres_df.head())" - ] - }, - { - "cell_type": "markdown", - "id": "671c01f9f4ae66d9", - "metadata": {}, - "source": [ - "With this, our target matrix is completed." - ] - }, - { - "cell_type": "markdown", - "id": "f5436c87", - "metadata": {}, - "source": [ - "### Structurizing Text\n", - "If we want our Model to be able to use text as an input, we have to vectorize the text. TF-IDF (Inverse Document Frequency) is an easy way of transforming each word into a feature with a 0 to 1 value. **TODO: filter out stopwords**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e8b407c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n", - "0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", - "2 0.0 0.0 0.0 0.14649 0.0 0.0 0.0 0.0 0.0 0.0 ... 
0.0 0.0 \n", - "3 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", - "4 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", - "\n", - " 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n", - "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "\n", - "[5 rows x 29351 columns]\n" - ] - } - ], - "source": [ - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "\n", - "vectorizer = TfidfVectorizer()\n", - "tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix, not pandas df\n", - "tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())\n", - "print(tfidf_df.head())" - ] - }, - { - "cell_type": "markdown", - "id": "ad84e777", - "metadata": {}, - "source": [ - "With this our feature matrix is completed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86d9da42f4df8e49", - "metadata": {}, - "outputs": [], - "source": [ - "X = tfidf_df\n", - "y = genres_df" - ] - }, - { - "cell_type": "markdown", - "id": "aeb782668f311cd8", - "metadata": {}, - "source": [ - "## The Model\n", - "\n", - "#### Removing unpredicatble Datapoints\n", - "Some Datapoints don't have a genre assigned (all feature values in y are 0). The model we use can't handle such cases, thus they have to be removed.\n", - "We filter after all values that we can use with a mask, and apply that mask to our matrices." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4919bf1b37d171a7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "13\n" - ] - } - ], - "source": [ - "mask = y.sum(axis=1).map(lambda x: x > 0)\n", - "print((mask == False).sum()) # count of unpredictable datapoints\n", - "\n", - "X_clean = X[mask]\n", - "y_clean = y[mask]" - ] - }, - { - "cell_type": "markdown", - "id": "091d7e13", - "metadata": {}, - "source": [ - "# Splitting up data\n", - "We have to split up our data into training and testing data.\n", - "Using random_state=0 guarantees reproducability." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cfbf3787", - "metadata": { - "jupyter": { - "is_executing": true - } - }, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)" - ] - }, - { - "cell_type": "markdown", - "id": "12b5283d", - "metadata": {}, - "source": [ - "# Model Selection\n", - "**TODO Deciding which model to use for this task**\n", - "\n", - "As a game can have multiple genres, our Model(s) has to be capable of multi-label-classification. sklearn's ``MultiOutputClassifier`` can do this. 
As a backend for ``MultiOutputClassifier`` we use ``LogisticRegression``" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c1d72c4532bd509", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.multioutput import MultiOutputClassifier\n", - "\n", - "# n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is to bad)\n", - "multi_target_clf = MultiOutputClassifier(LogisticRegression(max_iter=1337, random_state=0), n_jobs=1)\n", - "\n", - "multi_target_clf.fit(X_train, y_train)\n", - "\n", - "y_pred = multi_target_clf.predict(X_test)" - ] - }, - { - "cell_type": "markdown", - "id": "0faa9856", - "metadata": {}, - "source": [ - "# Evaluation\n", - "**TODO Test the Model with the test data**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e2ebea6945193e07", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " 0 0.78 0.91 0.84 300\n", - " 1 0.78 0.62 0.69 216\n", - " 2 1.00 0.03 0.07 86\n", - " 3 0.00 0.00 0.00 46\n", - " 4 1.00 0.04 0.07 83\n", - " 5 0.00 0.00 0.00 0\n", - " 6 0.79 0.81 0.80 245\n", - " 7 0.00 0.00 0.00 42\n", - " 8 0.90 0.34 0.49 127\n", - " 9 0.00 0.00 0.00 12\n", - " 10 0.89 0.25 0.39 127\n", - " 11 0.00 0.00 0.00 14\n", - " 12 0.88 0.14 0.24 106\n", - " 13 0.00 0.00 0.00 0\n", - "\n", - " micro avg 0.79 0.50 0.61 1404\n", - " macro avg 0.50 0.22 0.26 1404\n", - "weighted avg 0.77 0.50 0.53 1404\n", - " samples avg 0.77 0.56 0.60 1404\n", - "\n" - ] - } - ], - "source": [ - "from sklearn.metrics import classification_report\n", - "\n", - "print(classification_report(y_test, y_pred, zero_division=0.0))" - ] - }, - { - "cell_type": "markdown", - "id": "2aeb6fc2", - "metadata": {}, - "source": [ - "# Optimization\n", - "**TODO optimize the model based on the test results**" - ] - }, - { - "cell_type": 
"markdown", - "id": "79b20645", - "metadata": {}, - "source": [ - "# Validation\n", - "**TODO Predict actual values**" - ] - }, - { - "cell_type": "markdown", - "id": "3b709fb7", - "metadata": {}, - "source": [ - "# Conclusion and outlook\n", - "**TODO Write a conclusion and outlook what can be done and where the issues were.**" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/plot_maker.py b/plot_maker.py new file mode 100644 index 0000000..331ae94 --- /dev/null +++ b/plot_maker.py @@ -0,0 +1,38 @@ +import os +import matplotlib.pyplot as plt + +datasets = { + "cleaned": "games_march2025_cleaned", + "cleaned_2k": "games_march2025_cleaned_2k", + "cleaned_10k": "games_march2025_cleaned_10k" +} +# results[dataset_name][model_name] -> weighted-average F1 score read from that model's report file +results = {} + +for dataset_name, folder in datasets.items(): + results[dataset_name] = {} + for filename in os.listdir(folder): + if filename.endswith(".txt"): + model_name = filename.replace(".txt", "") + with open(os.path.join(folder, filename), "r") as f: + for line in f: + if line.strip().startswith("weighted avg"): + parts = line.split() + f1_score = float(parts[-2]) # tokens: "weighted", "avg", precision, recall, f1-score, support -> f1-score is second-to-last + results[dataset_name][model_name] = f1_score + +# Plot +models = sorted(results["cleaned"].keys()) # sort alphabetically so every dataset uses the same model order +x = range(len(models)) + +plt.figure(figsize=(12,6)) +plt.bar([i - 0.25 for i in x], [results["cleaned"][m] for m in models], width=0.25, label="cleaned") +plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.25, label="cleaned_2k") +plt.bar([i + 0.25 for i in x], [results["cleaned_10k"][m] for m in models], 
width=0.25, label="cleaned_10k") + +plt.xticks(x, models, rotation=45) +plt.ylabel("Weighted F1-Score") +plt.title("Model Performance across Datasets") +plt.legend() +plt.tight_layout() +plt.show() diff --git a/test_script.py b/test_script.py deleted file mode 100644 index de7e833..0000000 --- a/test_script.py +++ /dev/null @@ -1,133 +0,0 @@ - - -#### INITIALIZE - -import numpy as np -import pandas as pd -from sklearn import set_config -set_config(transform_output="pandas") # dataframe supremacy - -# load data -# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent -dataset = pd.read_csv("./games_march2025_cleaned_2k.csv",sep=",") -print(dataset.head()) - - - - -#### DROP UNIQUES -print("DROP") - -#TODO: wird eh unten beim transformer deleted - -# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent -#dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 
'support_url', 'support_email', -# 'metacritic_url', 'notes', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'], -# axis=1, inplace=True) -#print(dataset.head()) - -#### STRUCTURIZE AND STANDARDIZE -print("STRUCTURE") - -from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import FunctionTransformer - - -# desc, genres, tags -column_transformer = ColumnTransformer([ - # merge all descriptions - ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")), - ['detailed_description', 'about_the_game', 'short_description']), - # genre -> actual genre, but very coarse - # tags -> user defined tags; title num list - #TODO: decide whether we drop tags - ('pass', 'passthrough', ['genres']),#, 'tags' - ], - verbose_feature_names_out=False -) -dataset = column_transformer.fit_transform(dataset) -print(dataset) - - - -#### SET MISSING VALUES -print("SETMISS") - - -# Setting missing numeric values to the mean -dataset.fillna(dataset.mean(numeric_only=True), inplace=True) -# Setting missing text values to 'Unknown' -dataset.fillna('', inplace=True) -# Setting missing values in other columns to NaN -dataset.dropna(inplace=True) - - - - -##### STRUCTURIZE GENRES to onehot -from sklearn.preprocessing import MultiLabelBinarizer -import ast -#serialize array -dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s)) -print(dataset['genres']) # in py but not yet onehotenc - -# MultiLabelBinarizer does onehotenc for arrays -mlb_genres = MultiLabelBinarizer() -genres_encoded = mlb_genres.fit_transform(dataset.pop('genres')) -genres_count = len(mlb_genres.classes_) # for multi-label classifiction later - -genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_) -print(genres_df) -#dataset = pd.concat([dataset, genres_df], axis=1) -#print(dataset) - - -#### convert text to bag of words - -## Count vs Tfidf vectorizer -from sklearn.feature_extraction.text import TfidfVectorizer 
-vectorizer = TfidfVectorizer() -tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix -tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()) -print(tfidf_df) - - -##### MODEL -print("MODEL") - -from sklearn.linear_model import LogisticRegression -from sklearn.multioutput import MultiOutputClassifier -from sklearn.metrics import classification_report - - -X = tfidf_df -y = genres_df - - -# cleanup datapoints that dont have a target value (all target columns are 0) -mask = y.sum(axis=1).map(lambda x: x > 0) -#print((mask == False).sum()) #31 cases with all target columns 0 -X_clean = X[mask] -y_clean = y[mask] - -# Split dataset -from sklearn.model_selection import train_test_split -X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0) - - -# we want to have multiple possible outputs (multi-label-classficiation) -> multioutputclassifier -# logi regression is our base system -# n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is too bad) -multi_target_clf = MultiOutputClassifier(LogisticRegression(max_iter=1337, random_state=0), n_jobs=1) - -# model training -multi_target_clf.fit(X_train, y_train) - -# predict against test data -y_pred = multi_target_clf.predict(X_test) - -# print prec, recall, f1 etc -print(classification_report(y_test, y_pred, zero_division=0.0)) - - -#print(f"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}")