-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_dataset.py
63 lines (50 loc) · 2.35 KB
/
process_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import re
from collections import defaultdict
def process_data(file_path, log_path='boundaries.log'):
    """Clean a word-list file in place and write per-letter statistics.

    The file at ``file_path`` is read line by line. A line is kept only if,
    after stripping surrounding whitespace, it consists solely of ASCII
    letters. The kept words are de-duplicated, sorted case-sensitively (code
    point order, so uppercase words sort before lowercase ones), and written
    back over ``file_path``, one word per line.

    A report is then written to ``log_path``: a ``Total rows`` line followed
    by one line for every letter that starts or ends at least one word. Each
    line gives the number of words starting with the letter, the number
    ending with it, and the longest, first, and last word starting with it
    (``N/A`` when no word starts with that letter).

    Args:
        file_path: Path of the dataset to clean. The file is overwritten.
        log_path: Path of the statistics report. The file is overwritten.
            Defaults to ``'boundaries.log'`` (relative to the current
            working directory).

    Returns:
        None.

    Raises:
        OSError: If either file cannot be opened, read, or written.
    """
    pattern = re.compile(r'[a-zA-Z]+')

    # Decode explicitly instead of relying on the locale. Undecodable bytes
    # are replaced rather than raised on: a line containing them can never
    # match the ASCII-letters pattern, so it is dropped like any other
    # non-alphabetic line instead of aborting the whole run.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        stripped_lines = (line.strip() for line in file)
        # fullmatch rejects the empty string too, so blank lines fall out here.
        words = {word for word in stripped_lines if pattern.fullmatch(word)}

    # Plain string ordering is already case-sensitive.
    unique_lines = sorted(words)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(f"{line}\n" for line in unique_lines)

    starts_with_count = defaultdict(int)
    ends_with_count = defaultdict(int)
    longest_words_start = {}
    first_words = {}
    last_words = {}

    for word in unique_lines:
        start_letter, end_letter = word[0], word[-1]
        starts_with_count[start_letter] += 1
        ends_with_count[end_letter] += 1

        # Strict '>' keeps the earliest word in sort order on length ties.
        if len(word) > len(longest_words_start.get(start_letter, '')):
            longest_words_start[start_letter] = word

        # unique_lines is sorted, so the first word seen for a letter is its
        # smallest and the most recent word seen is its largest.
        first_words.setdefault(start_letter, word)
        last_words[start_letter] = word

    letters = sorted(set(starts_with_count) | set(ends_with_count))

    with open(log_path, 'w', encoding='utf-8') as log:
        log.write(f"Total rows: {len(unique_lines)}\n")
        for letter in letters:
            # .get() avoids inserting zero entries into the defaultdicts.
            log.write(
                f"{letter}: Starts {starts_with_count.get(letter, 0)}, "
                f"Ends {ends_with_count.get(letter, 0)}, "
                f"Longest {longest_words_start.get(letter, 'N/A')}, "
                f"First {first_words.get(letter, 'N/A')}, "
                f"Last {last_words.get(letter, 'N/A')}\n"
            )
# Script entry point: when run directly (not imported), process the
# 'dataset.txt' file found in the current working directory.
if __name__ == "__main__":
    process_data("dataset.txt")