mismades were taken
This commit is contained in:
BIN
compare_dataset_sizes.png
Normal file
BIN
compare_dataset_sizes.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 62 KiB |
@@ -31,7 +31,7 @@ plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.25, label="cleane
|
||||
plt.bar([i + 0.25 for i in x], [results["cleaned_10k"][m] for m in models], width=0.25, label="cleaned_10k")
|
||||
|
||||
plt.xticks(x, models, rotation=45)
|
||||
plt.ylabel("Weighted F1-Score")
|
||||
plt.ylabel("F1-Score")
|
||||
plt.title("Model Performance across Datasets")
|
||||
plt.legend()
|
||||
plt.tight_layout()
|
||||
BIN
compare_models_2k.png
Normal file
BIN
compare_models_2k.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 343 KiB |
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.75 0.90 0.82 300
|
||||
1 0.72 0.68 0.70 216
|
||||
2 0.50 0.08 0.14 86
|
||||
3 0.27 0.07 0.11 46
|
||||
4 0.40 0.07 0.12 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.77 0.82 0.79 245
|
||||
7 0.33 0.10 0.15 42
|
||||
8 0.67 0.40 0.50 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.71 0.37 0.49 127
|
||||
11 0.00 0.00 0.00 14
|
||||
12 0.49 0.31 0.38 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.70 0.55 0.62 1404
|
||||
macro avg 0.40 0.27 0.30 1404
|
||||
weighted avg 0.64 0.55 0.56 1404
|
||||
samples avg 0.73 0.59 0.61 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.76 0.73 0.75 300
|
||||
1 0.56 0.53 0.54 216
|
||||
2 0.36 0.33 0.34 86
|
||||
3 0.33 0.26 0.29 46
|
||||
4 0.40 0.46 0.43 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.65 0.61 0.63 245
|
||||
7 0.39 0.40 0.40 42
|
||||
8 0.59 0.57 0.58 127
|
||||
9 0.60 0.25 0.35 12
|
||||
10 0.56 0.51 0.53 127
|
||||
11 0.39 0.50 0.44 14
|
||||
12 0.52 0.49 0.50 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.58 0.55 0.57 1404
|
||||
macro avg 0.44 0.40 0.41 1404
|
||||
weighted avg 0.58 0.55 0.57 1404
|
||||
samples avg 0.59 0.59 0.55 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.76 0.80 0.78 300
|
||||
1 0.62 0.51 0.56 216
|
||||
2 0.63 0.14 0.23 86
|
||||
3 0.17 0.02 0.04 46
|
||||
4 0.42 0.10 0.16 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.68 0.66 0.67 245
|
||||
7 0.56 0.12 0.20 42
|
||||
8 0.55 0.33 0.41 127
|
||||
9 0.67 0.17 0.27 12
|
||||
10 0.65 0.31 0.42 127
|
||||
11 1.00 0.14 0.25 14
|
||||
12 0.53 0.29 0.38 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.66 0.47 0.55 1404
|
||||
macro avg 0.52 0.26 0.31 1404
|
||||
weighted avg 0.62 0.47 0.51 1404
|
||||
samples avg 0.67 0.53 0.55 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.85 0.80 0.83 300
|
||||
1 0.77 0.61 0.68 216
|
||||
2 0.55 0.13 0.21 86
|
||||
3 0.42 0.11 0.17 46
|
||||
4 0.68 0.33 0.44 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.71 0.76 0.74 245
|
||||
7 0.61 0.26 0.37 42
|
||||
8 0.81 0.50 0.61 127
|
||||
9 0.75 0.25 0.38 12
|
||||
10 0.81 0.54 0.65 127
|
||||
11 0.40 0.43 0.41 14
|
||||
12 0.69 0.42 0.53 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.76 0.57 0.65 1404
|
||||
macro avg 0.57 0.37 0.43 1404
|
||||
weighted avg 0.74 0.57 0.63 1404
|
||||
samples avg 0.76 0.63 0.65 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.85 0.87 0.86 300
|
||||
1 0.76 0.66 0.70 216
|
||||
2 0.77 0.20 0.31 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 0.76 0.27 0.39 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.78 0.81 0.79 245
|
||||
7 0.89 0.19 0.31 42
|
||||
8 0.77 0.60 0.67 127
|
||||
9 1.00 0.58 0.74 12
|
||||
10 0.85 0.54 0.66 127
|
||||
11 1.00 0.29 0.44 14
|
||||
12 0.82 0.42 0.56 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.80 0.61 0.69 1404
|
||||
macro avg 0.66 0.39 0.46 1404
|
||||
weighted avg 0.78 0.61 0.66 1404
|
||||
samples avg 0.81 0.67 0.69 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.78 0.91 0.84 300
|
||||
1 0.78 0.62 0.69 216
|
||||
2 1.00 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 1.00 0.04 0.07 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.79 0.81 0.80 245
|
||||
7 0.00 0.00 0.00 42
|
||||
8 0.90 0.34 0.49 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.89 0.25 0.39 127
|
||||
11 0.00 0.00 0.00 14
|
||||
12 0.88 0.14 0.24 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.79 0.50 0.61 1404
|
||||
macro avg 0.50 0.22 0.26 1404
|
||||
weighted avg 0.77 0.50 0.53 1404
|
||||
samples avg 0.77 0.56 0.60 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.78 0.91 0.84 300
|
||||
1 0.78 0.62 0.69 216
|
||||
2 1.00 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 1.00 0.04 0.07 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.79 0.81 0.80 245
|
||||
7 0.00 0.00 0.00 42
|
||||
8 0.90 0.34 0.49 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.89 0.25 0.39 127
|
||||
11 0.00 0.00 0.00 14
|
||||
12 0.88 0.14 0.24 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.79 0.50 0.61 1404
|
||||
macro avg 0.50 0.22 0.26 1404
|
||||
weighted avg 0.77 0.50 0.53 1404
|
||||
samples avg 0.77 0.56 0.60 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.84 0.85 0.84 300
|
||||
1 0.73 0.67 0.70 216
|
||||
2 0.74 0.30 0.43 86
|
||||
3 0.50 0.02 0.04 46
|
||||
4 0.69 0.24 0.36 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.79 0.79 0.79 245
|
||||
7 0.86 0.14 0.24 42
|
||||
8 0.76 0.63 0.69 127
|
||||
9 1.00 0.33 0.50 12
|
||||
10 0.81 0.52 0.63 127
|
||||
11 1.00 0.14 0.25 14
|
||||
12 0.75 0.41 0.53 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.79 0.60 0.68 1404
|
||||
macro avg 0.68 0.36 0.43 1404
|
||||
weighted avg 0.78 0.60 0.65 1404
|
||||
samples avg 0.80 0.66 0.68 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.64 0.99 0.78 300
|
||||
1 0.85 0.24 0.37 216
|
||||
2 0.60 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 0.80 0.05 0.09 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.78 0.80 0.79 245
|
||||
7 0.40 0.05 0.09 42
|
||||
8 1.00 0.04 0.08 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.20 0.01 0.02 127
|
||||
11 0.00 0.00 0.00 14
|
||||
12 1.00 0.05 0.09 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.69 0.40 0.51 1404
|
||||
macro avg 0.45 0.16 0.17 1404
|
||||
weighted avg 0.68 0.40 0.39 1404
|
||||
samples avg 0.70 0.44 0.50 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.80 0.88 0.84 300
|
||||
1 0.78 0.55 0.64 216
|
||||
2 1.00 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 1.00 0.06 0.11 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.74 0.78 0.76 245
|
||||
7 0.00 0.00 0.00 42
|
||||
8 0.84 0.24 0.38 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.91 0.24 0.38 127
|
||||
11 1.00 0.14 0.25 14
|
||||
12 1.00 0.25 0.39 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.79 0.48 0.59 1404
|
||||
macro avg 0.58 0.23 0.27 1404
|
||||
weighted avg 0.78 0.48 0.52 1404
|
||||
samples avg 0.77 0.54 0.60 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.81 0.90 0.85 300
|
||||
1 0.76 0.63 0.69 216
|
||||
2 1.00 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 1.00 0.05 0.09 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.77 0.83 0.80 245
|
||||
7 0.00 0.00 0.00 42
|
||||
8 0.84 0.40 0.54 127
|
||||
9 1.00 0.17 0.29 12
|
||||
10 0.90 0.34 0.49 127
|
||||
11 1.00 0.14 0.25 14
|
||||
12 0.92 0.21 0.34 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.80 0.53 0.63 1404
|
||||
macro avg 0.64 0.26 0.32 1404
|
||||
weighted avg 0.79 0.53 0.56 1404
|
||||
samples avg 0.79 0.59 0.63 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.75 0.90 0.82 300
|
||||
1 0.72 0.68 0.70 216
|
||||
2 0.50 0.08 0.14 86
|
||||
3 0.27 0.07 0.11 46
|
||||
4 0.40 0.07 0.12 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.77 0.82 0.79 245
|
||||
7 0.33 0.10 0.15 42
|
||||
8 0.67 0.40 0.50 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.71 0.37 0.49 127
|
||||
11 0.00 0.00 0.00 14
|
||||
12 0.49 0.31 0.38 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.70 0.55 0.62 1404
|
||||
macro avg 0.40 0.27 0.30 1404
|
||||
weighted avg 0.64 0.55 0.56 1404
|
||||
samples avg 0.73 0.59 0.61 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.76 0.73 0.75 300
|
||||
1 0.56 0.53 0.54 216
|
||||
2 0.36 0.33 0.34 86
|
||||
3 0.33 0.26 0.29 46
|
||||
4 0.40 0.46 0.43 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.65 0.61 0.63 245
|
||||
7 0.39 0.40 0.40 42
|
||||
8 0.59 0.57 0.58 127
|
||||
9 0.60 0.25 0.35 12
|
||||
10 0.56 0.51 0.53 127
|
||||
11 0.39 0.50 0.44 14
|
||||
12 0.52 0.49 0.50 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.58 0.55 0.57 1404
|
||||
macro avg 0.44 0.40 0.41 1404
|
||||
weighted avg 0.58 0.55 0.57 1404
|
||||
samples avg 0.59 0.59 0.55 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.76 0.80 0.78 300
|
||||
1 0.62 0.51 0.56 216
|
||||
2 0.63 0.14 0.23 86
|
||||
3 0.17 0.02 0.04 46
|
||||
4 0.42 0.10 0.16 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.68 0.66 0.67 245
|
||||
7 0.56 0.12 0.20 42
|
||||
8 0.55 0.33 0.41 127
|
||||
9 0.67 0.17 0.27 12
|
||||
10 0.65 0.31 0.42 127
|
||||
11 1.00 0.14 0.25 14
|
||||
12 0.53 0.29 0.38 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.66 0.47 0.55 1404
|
||||
macro avg 0.52 0.26 0.31 1404
|
||||
weighted avg 0.62 0.47 0.51 1404
|
||||
samples avg 0.67 0.53 0.55 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.85 0.80 0.83 300
|
||||
1 0.77 0.61 0.68 216
|
||||
2 0.55 0.13 0.21 86
|
||||
3 0.42 0.11 0.17 46
|
||||
4 0.68 0.33 0.44 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.71 0.76 0.74 245
|
||||
7 0.61 0.26 0.37 42
|
||||
8 0.81 0.50 0.61 127
|
||||
9 0.75 0.25 0.38 12
|
||||
10 0.81 0.54 0.65 127
|
||||
11 0.40 0.43 0.41 14
|
||||
12 0.69 0.42 0.53 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.76 0.57 0.65 1404
|
||||
macro avg 0.57 0.37 0.43 1404
|
||||
weighted avg 0.74 0.57 0.63 1404
|
||||
samples avg 0.76 0.63 0.65 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.85 0.87 0.86 300
|
||||
1 0.76 0.66 0.70 216
|
||||
2 0.77 0.20 0.31 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 0.76 0.27 0.39 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.78 0.81 0.79 245
|
||||
7 0.89 0.19 0.31 42
|
||||
8 0.77 0.60 0.67 127
|
||||
9 1.00 0.58 0.74 12
|
||||
10 0.85 0.54 0.66 127
|
||||
11 1.00 0.29 0.44 14
|
||||
12 0.82 0.42 0.56 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.80 0.61 0.69 1404
|
||||
macro avg 0.66 0.39 0.46 1404
|
||||
weighted avg 0.78 0.61 0.66 1404
|
||||
samples avg 0.81 0.67 0.69 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.78 0.91 0.84 300
|
||||
1 0.78 0.62 0.69 216
|
||||
2 1.00 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 1.00 0.04 0.07 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.79 0.81 0.80 245
|
||||
7 0.00 0.00 0.00 42
|
||||
8 0.90 0.34 0.49 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.89 0.25 0.39 127
|
||||
11 0.00 0.00 0.00 14
|
||||
12 0.88 0.14 0.24 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.79 0.50 0.61 1404
|
||||
macro avg 0.50 0.22 0.26 1404
|
||||
weighted avg 0.77 0.50 0.53 1404
|
||||
samples avg 0.77 0.56 0.60 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.78 0.91 0.84 300
|
||||
1 0.78 0.62 0.69 216
|
||||
2 1.00 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 1.00 0.04 0.07 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.79 0.81 0.80 245
|
||||
7 0.00 0.00 0.00 42
|
||||
8 0.90 0.34 0.49 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.89 0.25 0.39 127
|
||||
11 0.00 0.00 0.00 14
|
||||
12 0.88 0.14 0.24 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.79 0.50 0.61 1404
|
||||
macro avg 0.50 0.22 0.26 1404
|
||||
weighted avg 0.77 0.50 0.53 1404
|
||||
samples avg 0.77 0.56 0.60 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.84 0.85 0.84 300
|
||||
1 0.73 0.67 0.70 216
|
||||
2 0.74 0.30 0.43 86
|
||||
3 0.50 0.02 0.04 46
|
||||
4 0.69 0.24 0.36 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.79 0.79 0.79 245
|
||||
7 0.86 0.14 0.24 42
|
||||
8 0.76 0.63 0.69 127
|
||||
9 1.00 0.33 0.50 12
|
||||
10 0.81 0.52 0.63 127
|
||||
11 1.00 0.14 0.25 14
|
||||
12 0.75 0.41 0.53 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.79 0.60 0.68 1404
|
||||
macro avg 0.68 0.36 0.43 1404
|
||||
weighted avg 0.78 0.60 0.65 1404
|
||||
samples avg 0.80 0.66 0.68 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.64 0.99 0.78 300
|
||||
1 0.85 0.24 0.37 216
|
||||
2 0.60 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 0.80 0.05 0.09 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.78 0.80 0.79 245
|
||||
7 0.40 0.05 0.09 42
|
||||
8 1.00 0.04 0.08 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.20 0.01 0.02 127
|
||||
11 0.00 0.00 0.00 14
|
||||
12 1.00 0.05 0.09 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.69 0.40 0.51 1404
|
||||
macro avg 0.45 0.16 0.17 1404
|
||||
weighted avg 0.68 0.40 0.39 1404
|
||||
samples avg 0.70 0.44 0.50 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.80 0.88 0.84 300
|
||||
1 0.78 0.55 0.64 216
|
||||
2 1.00 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 1.00 0.06 0.11 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.74 0.78 0.76 245
|
||||
7 0.00 0.00 0.00 42
|
||||
8 0.84 0.24 0.38 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.91 0.24 0.38 127
|
||||
11 1.00 0.14 0.25 14
|
||||
12 1.00 0.25 0.39 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.79 0.48 0.59 1404
|
||||
macro avg 0.58 0.23 0.27 1404
|
||||
weighted avg 0.78 0.48 0.52 1404
|
||||
samples avg 0.77 0.54 0.60 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.81 0.90 0.85 300
|
||||
1 0.76 0.63 0.69 216
|
||||
2 1.00 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 1.00 0.05 0.09 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.77 0.83 0.80 245
|
||||
7 0.00 0.00 0.00 42
|
||||
8 0.84 0.40 0.54 127
|
||||
9 1.00 0.17 0.29 12
|
||||
10 0.90 0.34 0.49 127
|
||||
11 1.00 0.14 0.25 14
|
||||
12 0.92 0.21 0.34 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.80 0.53 0.63 1404
|
||||
macro avg 0.64 0.26 0.32 1404
|
||||
weighted avg 0.79 0.53 0.56 1404
|
||||
samples avg 0.79 0.59 0.63 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.75 0.90 0.82 300
|
||||
1 0.72 0.68 0.70 216
|
||||
2 0.50 0.08 0.14 86
|
||||
3 0.27 0.07 0.11 46
|
||||
4 0.40 0.07 0.12 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.77 0.82 0.79 245
|
||||
7 0.33 0.10 0.15 42
|
||||
8 0.67 0.40 0.50 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.71 0.37 0.49 127
|
||||
11 0.00 0.00 0.00 14
|
||||
12 0.49 0.31 0.38 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.70 0.55 0.62 1404
|
||||
macro avg 0.40 0.27 0.30 1404
|
||||
weighted avg 0.64 0.55 0.56 1404
|
||||
samples avg 0.73 0.59 0.61 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.76 0.73 0.75 300
|
||||
1 0.56 0.53 0.54 216
|
||||
2 0.36 0.33 0.34 86
|
||||
3 0.33 0.26 0.29 46
|
||||
4 0.40 0.46 0.43 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.65 0.61 0.63 245
|
||||
7 0.39 0.40 0.40 42
|
||||
8 0.59 0.57 0.58 127
|
||||
9 0.60 0.25 0.35 12
|
||||
10 0.56 0.51 0.53 127
|
||||
11 0.39 0.50 0.44 14
|
||||
12 0.52 0.49 0.50 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.58 0.55 0.57 1404
|
||||
macro avg 0.44 0.40 0.41 1404
|
||||
weighted avg 0.58 0.55 0.57 1404
|
||||
samples avg 0.59 0.59 0.55 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.76 0.80 0.78 300
|
||||
1 0.62 0.51 0.56 216
|
||||
2 0.63 0.14 0.23 86
|
||||
3 0.17 0.02 0.04 46
|
||||
4 0.42 0.10 0.16 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.68 0.66 0.67 245
|
||||
7 0.56 0.12 0.20 42
|
||||
8 0.55 0.33 0.41 127
|
||||
9 0.67 0.17 0.27 12
|
||||
10 0.65 0.31 0.42 127
|
||||
11 1.00 0.14 0.25 14
|
||||
12 0.53 0.29 0.38 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.66 0.47 0.55 1404
|
||||
macro avg 0.52 0.26 0.31 1404
|
||||
weighted avg 0.62 0.47 0.51 1404
|
||||
samples avg 0.67 0.53 0.55 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.85 0.80 0.83 300
|
||||
1 0.77 0.61 0.68 216
|
||||
2 0.55 0.13 0.21 86
|
||||
3 0.42 0.11 0.17 46
|
||||
4 0.68 0.33 0.44 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.71 0.76 0.74 245
|
||||
7 0.61 0.26 0.37 42
|
||||
8 0.81 0.50 0.61 127
|
||||
9 0.75 0.25 0.38 12
|
||||
10 0.81 0.54 0.65 127
|
||||
11 0.40 0.43 0.41 14
|
||||
12 0.69 0.42 0.53 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.76 0.57 0.65 1404
|
||||
macro avg 0.57 0.37 0.43 1404
|
||||
weighted avg 0.74 0.57 0.63 1404
|
||||
samples avg 0.76 0.63 0.65 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.85 0.87 0.86 300
|
||||
1 0.76 0.66 0.70 216
|
||||
2 0.77 0.20 0.31 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 0.76 0.27 0.39 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.78 0.81 0.79 245
|
||||
7 0.89 0.19 0.31 42
|
||||
8 0.77 0.60 0.67 127
|
||||
9 1.00 0.58 0.74 12
|
||||
10 0.85 0.54 0.66 127
|
||||
11 1.00 0.29 0.44 14
|
||||
12 0.82 0.42 0.56 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.80 0.61 0.69 1404
|
||||
macro avg 0.66 0.39 0.46 1404
|
||||
weighted avg 0.78 0.61 0.66 1404
|
||||
samples avg 0.81 0.67 0.69 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.78 0.91 0.84 300
|
||||
1 0.78 0.62 0.69 216
|
||||
2 1.00 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 1.00 0.04 0.07 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.79 0.81 0.80 245
|
||||
7 0.00 0.00 0.00 42
|
||||
8 0.90 0.34 0.49 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.89 0.25 0.39 127
|
||||
11 0.00 0.00 0.00 14
|
||||
12 0.88 0.14 0.24 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.79 0.50 0.61 1404
|
||||
macro avg 0.50 0.22 0.26 1404
|
||||
weighted avg 0.77 0.50 0.53 1404
|
||||
samples avg 0.77 0.56 0.60 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.78 0.91 0.84 300
|
||||
1 0.78 0.62 0.69 216
|
||||
2 1.00 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 1.00 0.04 0.07 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.79 0.81 0.80 245
|
||||
7 0.00 0.00 0.00 42
|
||||
8 0.90 0.34 0.49 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.89 0.25 0.39 127
|
||||
11 0.00 0.00 0.00 14
|
||||
12 0.88 0.14 0.24 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.79 0.50 0.61 1404
|
||||
macro avg 0.50 0.22 0.26 1404
|
||||
weighted avg 0.77 0.50 0.53 1404
|
||||
samples avg 0.77 0.56 0.60 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.84 0.85 0.84 300
|
||||
1 0.73 0.67 0.70 216
|
||||
2 0.74 0.30 0.43 86
|
||||
3 0.50 0.02 0.04 46
|
||||
4 0.69 0.24 0.36 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.79 0.79 0.79 245
|
||||
7 0.86 0.14 0.24 42
|
||||
8 0.76 0.63 0.69 127
|
||||
9 1.00 0.33 0.50 12
|
||||
10 0.81 0.52 0.63 127
|
||||
11 1.00 0.14 0.25 14
|
||||
12 0.75 0.41 0.53 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.79 0.60 0.68 1404
|
||||
macro avg 0.68 0.36 0.43 1404
|
||||
weighted avg 0.78 0.60 0.65 1404
|
||||
samples avg 0.80 0.66 0.68 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.64 0.99 0.78 300
|
||||
1 0.85 0.24 0.37 216
|
||||
2 0.60 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 0.80 0.05 0.09 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.78 0.80 0.79 245
|
||||
7 0.40 0.05 0.09 42
|
||||
8 1.00 0.04 0.08 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.20 0.01 0.02 127
|
||||
11 0.00 0.00 0.00 14
|
||||
12 1.00 0.05 0.09 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.69 0.40 0.51 1404
|
||||
macro avg 0.45 0.16 0.17 1404
|
||||
weighted avg 0.68 0.40 0.39 1404
|
||||
samples avg 0.70 0.44 0.50 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.80 0.88 0.84 300
|
||||
1 0.78 0.55 0.64 216
|
||||
2 1.00 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 1.00 0.06 0.11 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.74 0.78 0.76 245
|
||||
7 0.00 0.00 0.00 42
|
||||
8 0.84 0.24 0.38 127
|
||||
9 0.00 0.00 0.00 12
|
||||
10 0.91 0.24 0.38 127
|
||||
11 1.00 0.14 0.25 14
|
||||
12 1.00 0.25 0.39 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.79 0.48 0.59 1404
|
||||
macro avg 0.58 0.23 0.27 1404
|
||||
weighted avg 0.78 0.48 0.52 1404
|
||||
samples avg 0.77 0.54 0.60 1404
|
||||
@@ -1,21 +0,0 @@
|
||||
precision recall f1-score support
|
||||
|
||||
0 0.81 0.90 0.85 300
|
||||
1 0.76 0.63 0.69 216
|
||||
2 1.00 0.03 0.07 86
|
||||
3 0.00 0.00 0.00 46
|
||||
4 1.00 0.05 0.09 83
|
||||
5 0.00 0.00 0.00 0
|
||||
6 0.77 0.83 0.80 245
|
||||
7 0.00 0.00 0.00 42
|
||||
8 0.84 0.40 0.54 127
|
||||
9 1.00 0.17 0.29 12
|
||||
10 0.90 0.34 0.49 127
|
||||
11 1.00 0.14 0.25 14
|
||||
12 0.92 0.21 0.34 106
|
||||
13 0.00 0.00 0.00 0
|
||||
|
||||
micro avg 0.80 0.53 0.63 1404
|
||||
macro avg 0.64 0.26 0.32 1404
|
||||
weighted avg 0.79 0.53 0.56 1404
|
||||
samples avg 0.79 0.59 0.63 1404
|
||||
@@ -4,12 +4,8 @@ import pandas as pd
|
||||
from sklearn import set_config
|
||||
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.preprocessing import FunctionTransformer
|
||||
|
||||
from sklearn.preprocessing import MultiLabelBinarizer
|
||||
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer
|
||||
import ast
|
||||
|
||||
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.multioutput import MultiOutputClassifier
|
||||
@@ -20,15 +16,19 @@ from sklearn.metrics import accuracy_score, classification_report
|
||||
from sklearn.svm import SVC, LinearSVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
|
||||
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, Perceptron, SGDClassifier
|
||||
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, StackingClassifier
|
||||
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
|
||||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
|
||||
|
||||
set_config(transform_output="pandas") # dataframe supremacy
|
||||
|
||||
def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
|
||||
dataset = pd.read_csv("./games_march2025_cleaned_2k.csv",sep=",")
|
||||
dataset = pd.read_csv(dataset,sep=",")
|
||||
# desc, genres, tags
|
||||
column_transformer = ColumnTransformer([
|
||||
# merge all descriptions
|
||||
@@ -39,9 +39,6 @@ def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
|
||||
verbose_feature_names_out=False
|
||||
)
|
||||
dataset = column_transformer.fit_transform(dataset)
|
||||
|
||||
|
||||
|
||||
#### SET MISSING VALUES
|
||||
print("SETMISS")
|
||||
# Setting missing numeric values to the mean
|
||||
@@ -50,36 +47,26 @@ def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
|
||||
dataset.fillna('', inplace=True)
|
||||
# Setting missing values in other columns to NaN
|
||||
dataset.dropna(inplace=True)
|
||||
|
||||
##### STRUCTURIZE GENRES to onehot
|
||||
#serialize array
|
||||
dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s))
|
||||
#print(dataset['genres']) # in py but not yet onehotenc
|
||||
|
||||
# MultiLabelBinarizer does onehotenc for arrays
|
||||
mlb_genres = MultiLabelBinarizer()
|
||||
genres_encoded = mlb_genres.fit_transform(dataset.pop('genres'))
|
||||
#genres_count = len(mlb_genres.classes_) # for multi-label classifiction later
|
||||
|
||||
genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)
|
||||
#print(genres_df)
|
||||
#dataset = pd.concat([dataset, genres_df], axis=1)
|
||||
#print(dataset)
|
||||
|
||||
|
||||
#### convert text to bag of words
|
||||
|
||||
## Count vs Tfidf vectorizer
|
||||
vectorizer = TfidfVectorizer()
|
||||
tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix
|
||||
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
|
||||
#print(tfidf_df)
|
||||
|
||||
|
||||
##### MODEL
|
||||
print("MODEL")
|
||||
|
||||
|
||||
X = tfidf_df
|
||||
y = genres_df
|
||||
# cleanup datapoints that dont have a target value (all target columns are 0)
|
||||
@@ -87,50 +74,63 @@ def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
|
||||
#print((mask == False).sum()) #31 cases with all target columns 0
|
||||
X_clean = X[mask]
|
||||
y_clean = y[mask]
|
||||
|
||||
# Split dataset
|
||||
return train_test_split(X_clean, y_clean, random_state=0)
|
||||
|
||||
def comparison(X_train, X_test, y_train, y_test, estimator, jobs: int = 1):
    """Fit one estimator per target column and report how it scores on the test split.

    Args:
        X_train: Training features passed to ``fit``.
        X_test: Features the fitted model predicts on.
        y_train: Training targets passed to ``fit``.
        y_test: Ground-truth targets the predictions are compared against.
        estimator: Base classifier wrapped in ``MultiOutputClassifier``
            (e.g. ``LogisticRegression(max_iter=1337, random_state=0)``).
        jobs: Forwarded to ``MultiOutputClassifier`` as ``n_jobs``.

    Returns:
        The result of ``classification_report(y_test, y_pred, zero_division=0.0)``.
    """
    # Wrap the base estimator so it is trained independently for each target.
    wrapped_model = MultiOutputClassifier(estimator, n_jobs=jobs)

    # Train on the training split, then predict the held-out split.
    predictions = wrapped_model.fit(X_train, y_train).predict(X_test)

    # zero_division=0.0 scores labels without predictions/support as 0 instead of warning.
    return classification_report(y_test, predictions, zero_division=0.0)
|
||||
|
||||
datasets = [
|
||||
'games_march2025_cleaned_2k.csv',
|
||||
'games_march2025_cleaned_10k.csv',
|
||||
'games_march2025_cleaned.csv'
|
||||
#'games_march2025_cleaned_10k.csv',
|
||||
#'games_march2025_cleaned.csv'
|
||||
]
|
||||
|
||||
max_iter = 3000 # <-- set your desired value here
|
||||
|
||||
estimators = {
|
||||
"LogisticRegression-i1000": LogisticRegression(max_iter=1000, random_state=0),
|
||||
"LogisticRegression-i10000": LogisticRegression(max_iter=10000, random_state=0),
|
||||
"LinearSVC-i5000": LinearSVC(max_iter=5000),
|
||||
"SVC-RBF-i10000": SVC(kernel="rbf", max_iter=10000),
|
||||
"LogisticRegression": LogisticRegression(random_state=0, max_iter=max_iter),
|
||||
"RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter),
|
||||
"PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter),
|
||||
"Perceptron": Perceptron(random_state=0, max_iter=max_iter),
|
||||
"SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter),
|
||||
"KNeighborsClassifier": KNeighborsClassifier(),
|
||||
"NearestCentroid": NearestCentroid(),
|
||||
"RadiusNeighborsClassifier": RadiusNeighborsClassifier(),
|
||||
"LinearSVC-i5000": LinearSVC(random_state=0, max_iter=max_iter),
|
||||
"SVC": SVC(random_state=0, max_iter=max_iter),
|
||||
"DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
|
||||
"RandomForestClassifier": RandomForestClassifier(random_state=0),
|
||||
"ExtraTreesClassifier": ExtraTreesClassifier(random_state=0),
|
||||
"BaggingClassifier": BaggingClassifier(random_state=0),
|
||||
"AdaBoostClassifier": AdaBoostClassifier(random_state=0),
|
||||
"GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
|
||||
"HistGradientBoostingClassifier": HistGradientBoostingClassifier(random_state=0, max_iter=max_iter),
|
||||
"GaussianNB": GaussianNB(),
|
||||
"MultinomialNB": MultinomialNB(),
|
||||
"BernoulliNB": BernoulliNB(),
|
||||
"ComplementNB": ComplementNB(),
|
||||
"LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
|
||||
"QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
|
||||
"MLPClassifier-i10000": MLPClassifier(max_iter=10000, random_state=0),
|
||||
"DummyClassifier": DummyClassifier(random_state=0)
|
||||
}
|
||||
|
||||
#"VotingClassifier": VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]),
|
||||
#"StackingClassifier": StackingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]),
|
||||
for dataset in datasets:
|
||||
print("-" * 60)
|
||||
print("dataset -> " + dataset)
|
||||
print("-" * 60)
|
||||
print("mkdir")
|
||||
folder = dataset.split(".csv")[0]
|
||||
if not os.path.isdir(folder):
|
||||
os.mkdir(folder)
|
||||
X_train, X_test, y_train, y_test = prepDataset(dataset)
|
||||
for esti in estimators:
|
||||
print("model: " + esti)
|
||||
compari = comparison(X_train, X_test, y_train, y_test, estimators[esti], 1) #TODO: change the job count if you can
|
||||
print("open")
|
||||
f = open(folder + "/" + esti +".txt", mode="w+", encoding="utf-8")
|
||||
183
notebook.ipynb
183
notebook.ipynb
@@ -23,36 +23,7 @@
|
||||
"is_executing": true
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" appid name release_date required_age price dlc_count \\\n",
|
||||
"0 730 Counter-Strike 2 2012-08-21 0 0.0 1 \n",
|
||||
"\n",
|
||||
" detailed_description \\\n",
|
||||
"0 For over two decades, Counter-Strike has offer... \n",
|
||||
"\n",
|
||||
" about_the_game \\\n",
|
||||
"0 For over two decades, Counter-Strike has offer... \n",
|
||||
"\n",
|
||||
" short_description reviews ... \\\n",
|
||||
"0 For over two decades, Counter-Strike has offer... NaN ... \n",
|
||||
"\n",
|
||||
" average_playtime_2weeks median_playtime_forever median_playtime_2weeks \\\n",
|
||||
"0 879 5174 350 \n",
|
||||
"\n",
|
||||
" discount peak_ccu tags \\\n",
|
||||
"0 0 1212356 {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'... \n",
|
||||
"\n",
|
||||
" pct_pos_total num_reviews_total pct_pos_recent num_reviews_recent \n",
|
||||
"0 86 8632939 82 96473 \n",
|
||||
"\n",
|
||||
"[1 rows x 47 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
@@ -120,27 +91,7 @@
|
||||
"is_executing": true
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" desc \\\n",
|
||||
"0 For over two decades, Counter-Strike has offer... \n",
|
||||
"1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n",
|
||||
"2 The most-played game on Steam. Every day, mill... \n",
|
||||
"3 When a young street hustler, a retired bank ro... \n",
|
||||
"4 Edition Comparison Ultimate Edition The Tom Cl... \n",
|
||||
"\n",
|
||||
" genres \n",
|
||||
"0 ['Action', 'Free To Play'] \n",
|
||||
"1 ['Action', 'Adventure', 'Massively Multiplayer... \n",
|
||||
"2 ['Action', 'Strategy', 'Free To Play'] \n",
|
||||
"3 ['Action', 'Adventure'] \n",
|
||||
"4 ['Action'] \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.compose import ColumnTransformer\n",
|
||||
"from sklearn.preprocessing import FunctionTransformer\n",
|
||||
@@ -200,20 +151,7 @@
|
||||
"execution_count": null,
|
||||
"id": "ebc5a24e9bc87fdd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0 [Action, Free To Play]\n",
|
||||
"1 [Action, Adventure, Massively Multiplayer, Fre...\n",
|
||||
"2 [Action, Strategy, Free To Play]\n",
|
||||
"3 [Action, Adventure]\n",
|
||||
"4 [Action]\n",
|
||||
"Name: genres, dtype: object\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import ast\n",
|
||||
"\n",
|
||||
@@ -236,27 +174,7 @@
|
||||
"execution_count": null,
|
||||
"id": "d2c3527a5fc876bf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Action Adventure Casual Early Access Free To Play Gore Indie \\\n",
|
||||
"0 1 0 0 0 1 0 0 \n",
|
||||
"1 1 1 0 0 1 0 0 \n",
|
||||
"2 1 0 0 0 1 0 0 \n",
|
||||
"3 1 1 0 0 0 0 0 \n",
|
||||
"4 1 0 0 0 0 0 0 \n",
|
||||
"\n",
|
||||
" Massively Multiplayer RPG Racing Simulation Sports Strategy Violent \n",
|
||||
"0 0 0 0 0 0 0 0 \n",
|
||||
"1 1 0 0 0 0 0 0 \n",
|
||||
"2 0 0 0 0 0 1 0 \n",
|
||||
"3 0 0 0 0 0 0 0 \n",
|
||||
"4 0 0 0 0 0 0 0 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
||||
"\n",
|
||||
@@ -288,29 +206,7 @@
|
||||
"execution_count": null,
|
||||
"id": "4e8b407c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n",
|
||||
"0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"1 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"2 0.0 0.0 0.0 0.14649 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"3 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"4 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"\n",
|
||||
" 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n",
|
||||
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 29351 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"\n",
|
||||
@@ -356,15 +252,7 @@
|
||||
"execution_count": null,
|
||||
"id": "4919bf1b37d171a7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"13\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mask = y.sum(axis=1).map(lambda x: x > 0)\n",
|
||||
"print((mask == False).sum()) # count of unpredictable datapoints\n",
|
||||
@@ -399,12 +287,38 @@
|
||||
"X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "84f56229",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now that all data is prepared, we need to choose a Classification Model that meets our standards."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "917ba82f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Excursion: Choosing a classification Model\n",
|
||||
"``sklearn`` has many different classification Models to choose from, but we only have limited time and computing power.\n",
|
||||
"As such, we tested many different models on the 2k Dataset and chose the 5 best performing ones for the big dataset.\n",
|
||||
"\n",
|
||||
"### The comparison\n",
|
||||
"We won't put the comparison script in this notebook, but you can find it in the ``compare_models.py`` file and try it out yourself.\n",
|
||||
"There were some rules as a baseline for comparison:\n",
|
||||
"- All Hyperparameters are set to default\n",
|
||||
"- All iteration limits are set to 3000\n",
|
||||
"\n",
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "12b5283d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Model Selection\n",
|
||||
"## Model Selection\n",
|
||||
"**TODO Deciding which model to use for this task**\n",
|
||||
"\n",
|
||||
"As a game can have multiple genres, our model(s) must be capable of multi-label classification. sklearn's ``MultiOutputClassifier`` can do this. As a backend for ``MultiOutputClassifier`` we use ``LogisticRegression``"
|
||||
@@ -442,36 +356,7 @@
|
||||
"execution_count": null,
|
||||
"id": "e2ebea6945193e07",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 0.78 0.91 0.84 300\n",
|
||||
" 1 0.78 0.62 0.69 216\n",
|
||||
" 2 1.00 0.03 0.07 86\n",
|
||||
" 3 0.00 0.00 0.00 46\n",
|
||||
" 4 1.00 0.04 0.07 83\n",
|
||||
" 5 0.00 0.00 0.00 0\n",
|
||||
" 6 0.79 0.81 0.80 245\n",
|
||||
" 7 0.00 0.00 0.00 42\n",
|
||||
" 8 0.90 0.34 0.49 127\n",
|
||||
" 9 0.00 0.00 0.00 12\n",
|
||||
" 10 0.89 0.25 0.39 127\n",
|
||||
" 11 0.00 0.00 0.00 14\n",
|
||||
" 12 0.88 0.14 0.24 106\n",
|
||||
" 13 0.00 0.00 0.00 0\n",
|
||||
"\n",
|
||||
" micro avg 0.79 0.50 0.61 1404\n",
|
||||
" macro avg 0.50 0.22 0.26 1404\n",
|
||||
"weighted avg 0.77 0.50 0.53 1404\n",
|
||||
" samples avg 0.77 0.56 0.60 1404\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.metrics import classification_report\n",
|
||||
"\n",
|
||||
|
||||
Reference in New Issue
Block a user