jupyter notebook missed some imports

This commit is contained in:
Tim
2025-08-13 13:56:30 +02:00
parent 9c3dd33c0b
commit 4b35d4ca21

View File

@@ -16,12 +16,43 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"id": "3116b75f", "id": "3116b75f",
"metadata": { "metadata": {
"jupyter": { "jupyter": {
"is_executing": true "is_executing": true
} }
}, },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" appid name release_date required_age price dlc_count \\\n",
"0 730 Counter-Strike 2 2012-08-21 0 0.0 1 \n",
"\n",
" detailed_description \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"\n",
" about_the_game \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"\n",
" short_description reviews ... \\\n",
"0 For over two decades, Counter-Strike has offer... NaN ... \n",
"\n",
" average_playtime_2weeks median_playtime_forever median_playtime_2weeks \\\n",
"0 879 5174 350 \n",
"\n",
" discount peak_ccu tags \\\n",
"0 0 1212356 {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'... \n",
"\n",
" pct_pos_total num_reviews_total pct_pos_recent num_reviews_recent \n",
"0 86 8632939 82 96473 \n",
"\n",
"[1 rows x 47 columns]\n"
]
}
],
"source": [ "source": [
"import numpy as np\n", "import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
@@ -30,10 +61,8 @@
"set_config(transform_output=\"pandas\")\n", "set_config(transform_output=\"pandas\")\n",
"\n", "\n",
"dataset = pd.read_csv(\"./games_march2025_cleaned_2k.csv\",sep=\",\")\n", "dataset = pd.read_csv(\"./games_march2025_cleaned_2k.csv\",sep=\",\")\n",
"print(dataset.head())" "print(dataset.head(1))"
], ]
"outputs": [],
"execution_count": null
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -60,35 +89,58 @@
] ]
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"id": "d159117377f3633c",
"metadata": {},
"outputs": [],
"source": [ "source": [
"#dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email', 'metacritic_url', 'notes', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'], axis=1, inplace=True)\n", "#dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email', 'metacritic_url', 'notes', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'], axis=1, inplace=True)\n",
"#print(dataset.head())" "#print(dataset.head())"
], ]
"id": "d159117377f3633c",
"outputs": [],
"execution_count": null
}, },
{ {
"metadata": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "e1b28ddd69f1e9a6",
"metadata": {},
"source": [ "source": [
"## Hold onto necessary information\n", "## Hold onto necessary information\n",
"Our model should turn a textual description of a game into its genre. For that we need all the textual information a game has, as well as the genres of the game.\n", "Our model should turn a textual description of a game into its genre. For that we need all the textual information a game has, as well as the genres of the game.\n",
"We use a ColumnTransformer to drop all unnecessary lines, merge all descriptions of a game into one big description and hold onto the genres\n", "We use a ColumnTransformer to drop all unnecessary lines, merge all descriptions of a game into one big description and hold onto the genres\n",
"\n", "\n",
"It is important to use ``verbose_feature_names_out=False`` so the feature names don't get changed" "It is important to use ``verbose_feature_names_out=False`` so the feature names don't get changed"
], ]
"id": "e1b28ddd69f1e9a6"
}, },
{ {
"cell_type": "code",
"execution_count": null,
"id": "986fbb31a7ae0d8b",
"metadata": { "metadata": {
"jupyter": { "jupyter": {
"is_executing": true "is_executing": true
} }
}, },
"cell_type": "code", "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" desc \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n",
"2 The most-played game on Steam. Every day, mill... \n",
"3 When a young street hustler, a retired bank ro... \n",
"4 Edition Comparison Ultimate Edition The Tom Cl... \n",
"\n",
" genres \n",
"0 ['Action', 'Free To Play'] \n",
"1 ['Action', 'Adventure', 'Massively Multiplayer... \n",
"2 ['Action', 'Strategy', 'Free To Play'] \n",
"3 ['Action', 'Adventure'] \n",
"4 ['Action'] \n"
]
}
],
"source": [ "source": [
"from sklearn.compose import ColumnTransformer\n", "from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import FunctionTransformer\n", "from sklearn.preprocessing import FunctionTransformer\n",
@@ -104,24 +156,24 @@
")\n", ")\n",
"dataset = column_transformer.fit_transform(dataset)\n", "dataset = column_transformer.fit_transform(dataset)\n",
"print(dataset.head())" "print(dataset.head())"
], ]
"id": "986fbb31a7ae0d8b",
"outputs": [],
"execution_count": null
}, },
{ {
"metadata": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "f9b89c0645811564",
"metadata": {},
"source": [ "source": [
"### Adding missing Information\n", "### Adding missing Information\n",
"Some Games might not have any descriptions. For these we Input an Empty String\n", "Some Games might not have any descriptions. For these we Input an Empty String\n",
"**TODO: check if dropna and fillna numeric_only is needed, as we dont have any numbers**" "**TODO: check if dropna and fillna numeric_only is needed, as we dont have any numbers**"
], ]
"id": "f9b89c0645811564"
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"id": "44239f6b7fd23cde",
"metadata": {},
"outputs": [],
"source": [ "source": [
"# missing numeric values => mean\n", "# missing numeric values => mean\n",
"dataset.fillna(dataset.mean(numeric_only=True), inplace=True)\n", "dataset.fillna(dataset.mean(numeric_only=True), inplace=True)\n",
@@ -129,49 +181,82 @@
"dataset.fillna('', inplace=True)\n", "dataset.fillna('', inplace=True)\n",
"# drop all lines with missing values\n", "# drop all lines with missing values\n",
"dataset.dropna(inplace=True)" "dataset.dropna(inplace=True)"
], ]
"id": "44239f6b7fd23cde",
"outputs": [],
"execution_count": null
}, },
{ {
"metadata": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "ca5b59b9fa8160a0",
"metadata": {},
"source": [ "source": [
"## Transform Genres\n", "## Transform Genres\n",
"The genre information currently is a string holding a python array of genres. While this is machine-readable, we need One-Hot-Encoding for our model to work.\n", "The genre information currently is a string holding a python array of genres. While this is machine-readable, we need One-Hot-Encoding for our model to work.\n",
"\n", "\n",
"#### Serializing the String-Array\n", "#### Serializing the String-Array\n",
"The \"ast\" library can interpret python strings as python code, and as such will be used for serializing the genres." "The \"ast\" library can interpret python strings as python code, and as such will be used for serializing the genres."
], ]
"id": "ca5b59b9fa8160a0"
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"id": "ebc5a24e9bc87fdd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 [Action, Free To Play]\n",
"1 [Action, Adventure, Massively Multiplayer, Fre...\n",
"2 [Action, Strategy, Free To Play]\n",
"3 [Action, Adventure]\n",
"4 [Action]\n",
"Name: genres, dtype: object\n"
]
}
],
"source": [ "source": [
"import ast\n", "import ast\n",
"\n", "\n",
"dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s))\n", "dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s))\n",
"print(dataset['genres'])" "print(dataset['genres'].head())"
], ]
"id": "ebc5a24e9bc87fdd",
"outputs": [],
"execution_count": null
}, },
{ {
"metadata": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "f90756f9ad9211f4",
"metadata": {},
"source": [ "source": [
"#### One-Hot-Encoding an Python-Array\n", "#### One-Hot-Encoding an Python-Array\n",
"The sklearn ``OneHotEncoder()`` is only able to work with an 1D Array of different classes, such as ``['Politics', 'Sport', 'Culture']``. Every datapoint can only have one concurrent classification.\n", "The sklearn ``OneHotEncoder()`` is only able to work with an 1D Array of different classes, such as ``['Politics', 'Sport', 'Culture']``. Every datapoint can only have one concurrent classification.\n",
"Steam allows an app/bundle to have multiple genres. As such, our dataset has an 2D Array of different classes, which sklearn's ``MultiLabelBinarizer()`` does support." "Steam allows an app/bundle to have multiple genres. As such, our dataset has an 2D Array of different classes, which sklearn's ``MultiLabelBinarizer()`` does support."
], ]
"id": "f90756f9ad9211f4"
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"id": "d2c3527a5fc876bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Action Adventure Casual Early Access Free To Play Gore Indie \\\n",
"0 1 0 0 0 1 0 0 \n",
"1 1 1 0 0 1 0 0 \n",
"2 1 0 0 0 1 0 0 \n",
"3 1 1 0 0 0 0 0 \n",
"4 1 0 0 0 0 0 0 \n",
"\n",
" Massively Multiplayer RPG Racing Simulation Sports Strategy Violent \n",
"0 0 0 0 0 0 0 0 \n",
"1 1 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 1 0 \n",
"3 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 \n"
]
}
],
"source": [ "source": [
"from sklearn.preprocessing import MultiLabelBinarizer\n", "from sklearn.preprocessing import MultiLabelBinarizer\n",
"\n", "\n",
@@ -179,16 +264,15 @@
"genres_encoded = mlb_genres.fit_transform(dataset.pop('genres'))\n", "genres_encoded = mlb_genres.fit_transform(dataset.pop('genres'))\n",
"genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)\n", "genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)\n",
"print(genres_df.head())" "print(genres_df.head())"
], ]
"id": "d2c3527a5fc876bf",
"outputs": [],
"execution_count": null
}, },
{ {
"metadata": {},
"cell_type": "markdown", "cell_type": "markdown",
"source": "With this, our target matrix is completed.", "id": "671c01f9f4ae66d9",
"id": "671c01f9f4ae66d9" "metadata": {},
"source": [
"With this, our target matrix is completed."
]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -201,8 +285,32 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"id": "4e8b407c", "id": "4e8b407c",
"metadata": {}, "metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n",
"0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.14649 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"\n",
" 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n",
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
"[5 rows x 29351 columns]\n"
]
}
],
"source": [ "source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n", "\n",
@@ -210,52 +318,60 @@
"tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix, not pandas df\n", "tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix, not pandas df\n",
"tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())\n", "tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())\n",
"print(tfidf_df.head())" "print(tfidf_df.head())"
], ]
"outputs": [],
"execution_count": null
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "ad84e777", "id": "ad84e777",
"metadata": {}, "metadata": {},
"source": "With this our feature matrix is completed" "source": [
"With this our feature matrix is completed"
]
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"outputs": [],
"execution_count": null, "execution_count": null,
"id": "86d9da42f4df8e49",
"metadata": {},
"outputs": [],
"source": [ "source": [
"X = tfidf_df\n", "X = tfidf_df\n",
"y = genres_df" "y = genres_df"
], ]
"id": "86d9da42f4df8e49"
}, },
{ {
"metadata": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "aeb782668f311cd8",
"metadata": {},
"source": [ "source": [
"## The Model\n", "## The Model\n",
"\n", "\n",
"#### Removing unpredicatble Datapoints\n", "#### Removing unpredicatble Datapoints\n",
"Some Datapoints don't have a genre assigned (all feature values in y are 0). The model we use can't handle such cases, thus they have to be removed.\n", "Some Datapoints don't have a genre assigned (all feature values in y are 0). The model we use can't handle such cases, thus they have to be removed.\n",
"We filter after all values that we can use with a mask, and apply that mask to our matrices." "We filter after all values that we can use with a mask, and apply that mask to our matrices."
], ]
"id": "aeb782668f311cd8"
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"outputs": [],
"execution_count": null, "execution_count": null,
"id": "4919bf1b37d171a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"13\n"
]
}
],
"source": [ "source": [
"mask = y.sum(axis=1).map(lambda x: x > 0)\n", "mask = y.sum(axis=1).map(lambda x: x > 0)\n",
"print((mask == False).sum()) # count of unpredictable datapoints\n", "print((mask == False).sum()) # count of unpredictable datapoints\n",
"\n", "\n",
"X_clean = X[mask]\n", "X_clean = X[mask]\n",
"y_clean = y[mask]" "y_clean = y[mask]"
], ]
"id": "4919bf1b37d171a7"
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -269,19 +385,19 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"id": "cfbf3787", "id": "cfbf3787",
"metadata": { "metadata": {
"jupyter": { "jupyter": {
"is_executing": true "is_executing": true
} }
}, },
"outputs": [],
"source": [ "source": [
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"\n", "\n",
"X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)" "X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)"
], ]
"outputs": [],
"execution_count": null
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -295,19 +411,22 @@
] ]
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"outputs": [],
"execution_count": null, "execution_count": null,
"id": "8c1d72c4532bd509",
"metadata": {},
"outputs": [],
"source": [ "source": [
"# n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is too bad)\n", "from sklearn.linear_model import LogisticRegression\n",
"from sklearn.multioutput import MultiOutputClassifier\n",
"\n",
"# n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is to bad)\n",
"multi_target_clf = MultiOutputClassifier(LogisticRegression(max_iter=1337, random_state=0), n_jobs=1)\n", "multi_target_clf = MultiOutputClassifier(LogisticRegression(max_iter=1337, random_state=0), n_jobs=1)\n",
"\n", "\n",
"multi_target_clf.fit(X_train, y_train)\n", "multi_target_clf.fit(X_train, y_train)\n",
"\n", "\n",
"y_pred = multi_target_clf.predict(X_test)" "y_pred = multi_target_clf.predict(X_test)"
], ]
"id": "8c1d72c4532bd509"
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -319,12 +438,45 @@
] ]
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"outputs": [],
"execution_count": null, "execution_count": null,
"source": "print(classification_report(y_test, y_pred, zero_division=0.0))", "id": "e2ebea6945193e07",
"id": "e2ebea6945193e07" "metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.78 0.91 0.84 300\n",
" 1 0.78 0.62 0.69 216\n",
" 2 1.00 0.03 0.07 86\n",
" 3 0.00 0.00 0.00 46\n",
" 4 1.00 0.04 0.07 83\n",
" 5 0.00 0.00 0.00 0\n",
" 6 0.79 0.81 0.80 245\n",
" 7 0.00 0.00 0.00 42\n",
" 8 0.90 0.34 0.49 127\n",
" 9 0.00 0.00 0.00 12\n",
" 10 0.89 0.25 0.39 127\n",
" 11 0.00 0.00 0.00 14\n",
" 12 0.88 0.14 0.24 106\n",
" 13 0.00 0.00 0.00 0\n",
"\n",
" micro avg 0.79 0.50 0.61 1404\n",
" macro avg 0.50 0.22 0.26 1404\n",
"weighted avg 0.77 0.50 0.53 1404\n",
" samples avg 0.77 0.56 0.60 1404\n",
"\n"
]
}
],
"source": [
"from sklearn.metrics import classification_report\n",
"\n",
"print(classification_report(y_test, y_pred, zero_division=0.0))"
]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",