-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscorecalculator
executable file
·191 lines (149 loc) · 7.29 KB
/
scorecalculator
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/python3
"""Calculate the total score of various choices by ranking and combining individual scores.
Written 2024 by Christian Siefkes.
Permission to use, copy, modify, and/or distribute this software for any purpose with or
without fee is hereby granted. No rights reserved.
This program reads data from a CSV file and modifies it in place, adding the calculated total
score and sorting the entries according to it.
The file format is assumed to be as follows:
* The first row is the header row; it is left unchanged.
* Columns whose header ends in "+)" are assumed to give a ranking where more is better, e.g. a
star rating or a rating/review count.
* Columns whose header ends in "-)" are assumed to give a ranking where less/smaller is better,
e.g. a price or distance.
* The total score will be written to the column with the header "Score". If there is no such
column, it will be created after all other columns. If it exists already, its previous
contents will be overwritten.
For each of the scoring criteria (column headers ending in "+)" or "-)"), the program assigns
scores from 1 (best) to n (worst), where n is the total number of items in the file. If several
items have the same value in one column, they are all given the same score (e.g. if the best
"stars" score is 5.0 and 3 items have it, they will all get score 1 for that column, while the
next item will then get score 4 – score counting continues in the usual way).
The calculated total score, as well as its components, will be written into the "Score" column.
For m scored columns the best possible score is m (1 in each column), hence e.g. "3 (1+1+1)" in
case of 3 components. Items are sorted by total score and, in case of ties, by the score
components and then the original order.
The original input file will be renamed by adding ".bak" to its name.
"""
import csv
import os
import re
import sys
from typing import List, Set
def rank_floats_with_ties(float_list: List[float], ascending: bool = True) -> List[int]:
    """
    Assigns a rank to every float in a list, giving equal values (ties) the same rank.

    After a run of tied values the next distinct value gets the rank matching its position in
    the sorted order, e.g. the values 1.0, 1.0, 2.0 are ranked 1, 1, 3.

    Parameters:

    * float_list: The list of floats to be ranked
    * ascending: if True, the smallest value gets rank 1, otherwise the largest value gets rank 1

    Returns a list of ranks, in the same order as the original float list.
    """
    direction = 1 if ascending else -1

    # Indices of the input, ordered from best to worst value
    order = sorted(range(len(float_list)), key=lambda i: direction * float_list[i])

    ranks = [0] * len(float_list)
    rank = 1
    last_seen = None

    for position, item_idx in enumerate(order, start=1):
        value = float_list[item_idx]
        # Only a value that differs from its predecessor opens a new rank (its 1-based
        # position); a repeated value keeps the rank of the run it belongs to
        if last_seen is None or value != last_seen:
            rank = position
        ranks[item_idx] = rank
        last_seen = value

    return ranks
def format_total_scores(all_specific_scores: List[List[int]]) -> List[str]:
    """Calculate and return the total scores based on a nested list of specific scores.

    `all_specific_scores` holds one inner list per scored column; each inner list contains the
    score of every row in that column.

    Each element in the result list will be the total score for that row, followed by its
    component scores in column order, e.g. '7 (3+2+2)'. If there are no scored columns (empty
    input), an empty list is returned.
    """
    result: List[str] = []
    # zip(*columns) turns the per-column lists into one tuple of scores per row; for an empty
    # input it simply yields nothing
    for row_scores in zip(*all_specific_scores):
        parts = '+'.join(str(score) for score in row_scores)
        result.append(f'{sum(row_scores)} ({parts})')
    return result
def calculate_scores(data: List[List[str]], pos_cols: Set[int], neg_cols: Set[int],
                     score_col: int) -> List[List[str]]:
    """Scores the data from a CSV file based on identified negative and positive columns.

    `data` is the original data (the first row is the header), `pos_cols` is the set of
    positive columns (biggest value is best), `neg_cols` is the set of negative columns
    (smallest value is best), `score_col` is the column into which to write the combined score.

    Returns a copy of the data with the combined score added, but not yet sorted. The rows
    passed in by the caller are not modified. If `data` is empty or none of the header's
    columns is scored, the copy is returned without scores.

    Raises ValueError if a scored column has a missing or non-numeric value in some row.
    """
    # Work on copies of the rows so that the caller's data stays untouched, as documented
    scored = [list(row) for row in data]
    if not scored:
        return scored

    header = scored[0]
    all_specific_scores: List[List[int]] = []

    # Calculate specific scores, one list per scored column (in column order)
    for idx in range(len(header)):
        if idx not in pos_cols and idx not in neg_cols:
            continue
        specific_values: List[float] = []
        # Row numbers in messages are 1-based and count the header as row 1
        for row_no, row in enumerate(scored[1:], start=2):
            try:
                specific_values.append(float(row[idx]))
            except (IndexError, ValueError) as err:
                raise ValueError(
                    f'Row {row_no}, column {idx + 1}: expected a number in scored column '
                    f'{header[idx]!r}') from err
        # For negative columns the smallest value is best and hence must get rank 1
        ascending = idx in neg_cols
        all_specific_scores.append(rank_floats_with_ties(specific_values, ascending))

    # Without any scored column there is nothing to combine
    if not all_specific_scores:
        return scored

    # Fill rows in score column
    for row, score in zip(scored[1:], format_total_scores(all_specific_scores)):
        # First make sure the row is long enough
        if score_col >= len(row):
            row.extend([''] * (score_col + 1 - len(row)))
        row[score_col] = score
    return scored
def score_file(filename: str) -> None:
    """Score a CSV file, finally writing it back with the total score added.

    The first row is taken as the header row. Columns whose (stripped) header ends in "+)" or
    "-)" are scored, and the result is written into the column whose header is "Score"; that
    column is appended if it does not exist yet. Completely blank lines in the file are
    skipped. The data rows are sorted by score before the file is written back; the original
    file is kept under its name with '.bak' appended.

    Prints an error message to stderr and exits the program with status 1 if the file contains
    no data or no columns to score.
    """
    # Read input CSV; csv.reader returns blank lines as empty rows, which cannot be scored
    with open(filename, newline='', encoding='utf-8') as f:
        data = [row for row in csv.reader(f) if row]

    if not data:
        print(f'Error: {filename} contains no data', file=sys.stderr)
        sys.exit(1)

    header_row = data[0]
    pos_cols = set()
    neg_cols = set()
    score_col = None

    # Check which columns to score positively: "+)", which ones negatively: "-)", and where
    # to store the total "Score"
    for idx, header in enumerate(header_row):
        header = header.strip()
        if header.endswith('+)'):
            pos_cols.add(idx)
        elif header.endswith('-)'):
            neg_cols.add(idx)
        elif header == 'Score':
            score_col = idx

    # Make sure that we got at least one column to score
    if not pos_cols and not neg_cols:
        print('Error: No columns to score found (end the header with "+)" or "-)")',
              file=sys.stderr)
        sys.exit(1)

    # Add score column, if needed (header_row is data[0] itself, so appending is enough)
    if score_col is None:
        score_col = len(header_row)
        header_row.append('Score')

    # Calculate and add scores
    data = calculate_scores(data, pos_cols, neg_cols, score_col)

    # Sort the data (except for the header) by the integers in the score column, so the sorting
    # will be chiefly by the total score, with the first, second etc. component scores used as
    # tiebreakers; sorted() is stable, so remaining ties keep their original order
    data[1:] = sorted(data[1:],
                      key=lambda row: [int(num) for num in re.findall(r'\d+', row[score_col])])

    # Rename original file by adding '.bak'
    os.replace(filename, filename + '.bak')

    # Write new version of the file that includes the scores
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(data)
if __name__ == "__main__":
    # Without any file arguments there is nothing to do: report the problem on stderr and
    # signal failure through the exit status
    if len(sys.argv) < 2:
        print('Error: Specify one or more files to score as arguments', file=sys.stderr)
        sys.exit(1)

    # Score all files specified as command-line arguments
    for filename in sys.argv[1:]:
        score_file(filename)