-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy path3_word_cloud_data.cpp
147 lines (126 loc) · 4.97 KB
/
3_word_cloud_data.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/**
* Count the frequency of each word in string
*
* Time Complexity: O(n)
* Space Complexity: O(n)
*
* * What I learned:
*
* 1) how to split words from string
* - CHECKPOINTS: handle the end of the input string,
* hyphenated words,
* punctuation ( . , ! ? ' () ... ),
* and edge cases
* -> use isalpha() and check LENGTH for each word
*
* cf) if thought of building the word character by character (using +=),
* parsing into words costs O(n^2) time
* (bcuz each time new char is appended -> creates whole new string)
*
* => instead, keep track of START INDEX and LENGTH of word: O(n) time
* or could use stringstream
*
* 2) handle duplicate words w/ different cases: (ex. how to handle "Blue", "blue", "BLue", etc.)
* -> various options (none are perfect; each has pros and cons)
*
* - ex. only store a word uppercase if it is always uppercase in the original string,
* store a word uppercase, if it is ever uppercase in the original string
* use an API or other tool that identifies proper nouns,
* ignore case entirely and make every word lowercase,
* etc.
* => come up with several heuristics, weigh them carefully, and choose the best solution for the given context
*
* 3) take advantage of built-in functions:
* - chars: isalpha(), islower(), isupper(), isdigit(), tolower(), toupper()
* - strings: transform(s.begin(), s.end(), s.begin(), ::tolower),
*
* 4) use a CLASS:
* (allows us to tie our methods together,
* calling them on instances of our class INSTEAD OF PASSING REFERENCES)
*
*/
#include <bits/stdc++.h>
using namespace std;
/**
* Define class WordCloudData
* which contains wordsToCounts_ data = map of frequency of words
* and methods to populate the wordsToCounds data
*
*/
class WordCloudData {
private:
unordered_map<string, size_t> wordsToCounts_;
/**
* helper function to count frequencies of each word
* (handles specifics on how to deal w/ uppercase /lowercase words)
*/
void addWordToHashMap(string word) {
// if word already exists in hash map,
if (wordsToCounts_.count(word) > 0) {
// increment its count
wordsToCounts_[word] += 1;
}
// if word does not exist,
else {
string lowercaseWord = word;
lowercaseWord[0] = tolower(lowercaseWord[0]);
string capitalizedWord = word;
capitalizedWord[0] = toupper(capitalizedWord[0]);
// if word is capitalized and lowercase version exists in hash map,
if (isupper(word[0]) && wordsToCounts_.count(lowercaseWord) > 0) {
// include as lowercase version
wordsToCounts_[lowercaseWord] += 1;
}
// else, if word is lowercase and capitalized version exists in hash map
else if (islower(word[0]) && wordsToCounts_.count(capitalizedWord) > 0) {
// include merge two versions into lowercase version
wordsToCounts_[word] = wordsToCounts_[capitalizedWord] + 1;
wordsToCounts_.erase(capitalizedWord);
}
else {
// add to hash map as is
wordsToCounts_[word] = 1;
}
}
}
/**
* count the frequency of each word in string
* (split the string into words and then,
* pass each word to helper function addWordToHashMap)
*
* -> CAREFUL w/ hyphenated words, punctuation, etc.
*/
void populateWordsToCounts(const string &inputString) {
// split string into words (careful of punctuations!)
vector<string> words;
int startIndex = 0;
for (int i = 0, n = inputString.length(); i <= n; i++) {
// if letter is not alphabet or end of string
if (!isalpha(inputString[i]) || i == n) {
// if end of word (length of word > 0)
if (i > startIndex) {
// include apostrophe (') and hyphen (-) between words
if (inputString[i] == '\'' || inputString[i] == '-') {
continue;
}
// parse word
int length = i - startIndex;
string word = inputString.substr(startIndex, length);
}
// update start to new word
startIndex = i + 1;
}
}
}
public:
/** constructor for WordCloudData object
* (counts frequency of each word in given string using an unordered map)
*/
WordCloudData(const string &inputString) {
populateWordsToCounts(inputString);
}
// returns unordered map (frequency count of each word)
const unordered_map<string, size_t> getWordsToCounts() const {
return wordsToCounts_;
}
};