From 717da9dd2215277c33301d8e6fb06484c83920a0 Mon Sep 17 00:00:00 2001 From: Tim <47184194+imgde@users.noreply.github.com> Date: Tue, 19 Aug 2025 13:23:20 +0200 Subject: [PATCH] download more ram --- compare_models_10k.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/compare_models_10k.py b/compare_models_10k.py index dbd3121..7916541 100644 --- a/compare_models_10k.py +++ b/compare_models_10k.py @@ -2,6 +2,7 @@ import os import numpy as np import pandas as pd from sklearn import set_config +import gc from sklearn.compose import ColumnTransformer from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer @@ -30,6 +31,7 @@ set_config(transform_output="pandas") # dataframe supremacy jobs = 12 max_iter = 3000 +min_entries = 5 def prepDataset(dataset): #returns X_train, X_test, y_train, y_test dataset = pd.read_csv(dataset,sep=",") @@ -73,11 +75,35 @@ def prepDataset(dataset): #returns X_train, X_test, y_train, y_test print("MODEL") X = tfidf_df y = genres_df + + # remove genres that have fewer than min_entries entries -> otherwise the probability of a broken split is too big + mask = (y == 1).sum() >= min_entries + print(y.shape) + y_prep = y.loc[:, mask] + print(y_prep.shape) + del mask + del y + # cleanup datapoints that dont have a target value (all target columns are 0) - mask = y.sum(axis=1).map(lambda x: x > 0) + mask = y_prep.sum(axis=1).map(lambda x: x > 0) #print((mask == False).sum()) #31 cases with all target columns 0 X_clean = X[mask] - y_clean = y[mask] + y_clean = y_prep[mask] + + # clean ram edition + del dataset + del column_transformer #- + del mlb_genres + del genres_encoded + del genres_df #- + del tfidf_df + del vectorizer + del tfidf_matrix #- + del X + del y_prep + del mask + gc.collect() + # Split dataset return train_test_split(X_clean, y_clean, random_state=0) def comparison(X_train, X_test, y_train, y_test, estimator,): #returns class_report @@ -93,8 +119,8 @@ datasets = [ 
#'games_march2025_cleaned.csv' ] estimators = { - "RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter), - "PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter), + #"RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter), + #"PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter), "Perceptron": Perceptron(random_state=0, max_iter=max_iter), "SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter), "NearestCentroid": NearestCentroid(),