This commit is contained in:
Maximilian Kany
2025-08-19 13:26:17 +02:00

View File

@@ -2,6 +2,7 @@ import os
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from sklearn import set_config from sklearn import set_config
import gc
from sklearn.compose import ColumnTransformer from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer
@@ -28,8 +29,9 @@ from sklearn.neural_network import MLPClassifier
set_config(transform_output="pandas") # dataframe supremacy set_config(transform_output="pandas") # dataframe supremacy
jobs = 3 jobs = 12
max_iter = 3000 max_iter = 3000
min_entries = 5
def prepDataset(dataset): #returns X_train, X_test, y_train, y_test def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
dataset = pd.read_csv(dataset,sep=",") dataset = pd.read_csv(dataset,sep=",")
@@ -73,12 +75,35 @@ def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
print("MODEL") print("MODEL")
X = tfidf_df X = tfidf_df
y = genres_df y = genres_df
# remove genres that have fewer than min_entries entries -> probability of a broken split is too big
mask = (y == 1).sum() >= min_entries
print(y.shape)
y_prep = y.loc[:, mask]
print(y_prep.shape)
del mask
del y
# cleanup datapoints that don't have a target value (all target columns are 0) # cleanup datapoints that don't have a target value (all target columns are 0)
mask = y.sum(axis=1).map(lambda x: x > 0) mask = y_prep.sum(axis=1).map(lambda x: x > 0)
#print((mask == False).sum()) #31 cases with all target columns 0 #print((mask == False).sum()) #31 cases with all target columns 0
X_clean = X[mask] X_clean = X[mask]
y_clean = y[mask] y_clean = y_prep[mask]
print(y_clean)
# free RAM: delete intermediates that are no longer needed, then force garbage collection
del dataset
del column_transformer #-
del mlb_genres
del genres_encoded
del genres_df #-
del tfidf_df
del vectorizer
del tfidf_matrix #-
del X
del y_prep
del mask
gc.collect()
# Split dataset # Split dataset
return train_test_split(X_clean, y_clean, random_state=0) return train_test_split(X_clean, y_clean, random_state=0)
def comparison(X_train, X_test, y_train, y_test, estimator,): #returns class_report def comparison(X_train, X_test, y_train, y_test, estimator,): #returns class_report
@@ -96,7 +121,7 @@ datasets = [
estimators = { estimators = {
#"RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter), #"RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter),
#"PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter), #"PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter),
#"Perceptron": Perceptron(random_state=0, max_iter=max_iter), "Perceptron": Perceptron(random_state=0, max_iter=max_iter),
"SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter), "SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter),
"NearestCentroid": NearestCentroid(), "NearestCentroid": NearestCentroid(),
"LinearSVC": LinearSVC(random_state=0, max_iter=max_iter), "LinearSVC": LinearSVC(random_state=0, max_iter=max_iter),