Fixing some performance bottlenecks #183
Conversation
line_profiler test before changes:
Total time: 3.58387 s
File: /charset_normalizer-master/charset_normalizer/md.py
Function: feed at line 140
Line # Hits Time Per Hit % Time Line Contents
==============================================================
140 @profile
141 def feed(self, character: str) -> None:
142 4654032 1089618.0 0.2 30.4 if (
143 2494830 899525.0 0.4 25.1 character.isspace() is False # includes \n \t \r \v
144 2144442 721900.0 0.3 20.1 and character.isprintable() is False
145 14760 4099.0 0.3 0.1 and character != "\x1A" # Why? Its the ASCII substitute character.
146 ):
147 14585 4963.0 0.3 0.1 self._unprintable_count += 1
148 2494830 863770.0 0.3 24.1 self._character_count += 1
Total time: 7.31769 s
File: /charset_normalizer-master/charset_normalizer/cd.py
Function: characters_popularity_compare at line 165
Line # Hits Time Per Hit % Time Line Contents
==============================================================
165 @profile
166 def characters_popularity_compare(
167 language: str, ordered_characters: List[str]
168 ) -> float:
169 """
170 Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
171 The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
172 Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
173 """
174 17568 9993.0 0.6 0.1 if language not in FREQUENCIES:
175 raise ValueError("{} not available".format(language))
176
177 17568 7660.0 0.4 0.1 character_approved_count = 0 # type: int
178
179 531331 214377.0 0.4 2.9 for character in ordered_characters:
180 513763 324669.0 0.6 4.4 if character not in FREQUENCIES[language]:
181 241494 96083.0 0.4 1.3 continue
182
183 544538 248351.0 0.5 3.4 characters_before_source = FREQUENCIES[language][
184 272269 182466.0 0.7 2.5 0 : FREQUENCIES[language].index(character)
185 ] # type: List[str]
186 544538 247284.0 0.5 3.4 characters_after_source = FREQUENCIES[language][
187 272269 176275.0 0.6 2.4 FREQUENCIES[language].index(character) :
188 ] # type: List[str]
189
190 544538 240014.0 0.4 3.3 characters_before = ordered_characters[
191 272269 173583.0 0.6 2.4 0 : ordered_characters.index(character)
192 ] # type: List[str]
193 544538 243403.0 0.4 3.3 characters_after = ordered_characters[
194 272269 170004.0 0.6 2.3 ordered_characters.index(character) :
195 ] # type: List[str]
196
197 816807 1396719.0 1.7 19.1 before_match_count = [
198 272269 109089.0 0.4 1.5 e in characters_before for e in characters_before_source
199 544538 254256.0 0.5 3.5 ].count(
200 272269 108328.0 0.4 1.5 True
201 ) # type: int
202 816807 1854127.0 2.3 25.3 after_match_count = [
203 272269 109019.0 0.4 1.5 e in characters_after for e in characters_after_source
204 544538 264154.0 0.5 3.6 ].count(
205 272269 108751.0 0.4 1.5 True
206 ) # type: int
207
208 272269 136920.0 0.5 1.9 if len(characters_before_source) == 0 and before_match_count <= 4:
209 12369 5471.0 0.4 0.1 character_approved_count += 1
210 12369 5580.0 0.5 0.1 continue
211
212 259900 121263.0 0.5 1.7 if len(characters_after_source) == 0 and after_match_count <= 4:
213 character_approved_count += 1
214 continue
215
216 306500 120823.0 0.4 1.7 if (
217 259900 135166.0 0.5 1.8 before_match_count / len(characters_before_source) >= 0.4
218 46600 23037.0 0.5 0.3 or after_match_count / len(characters_after_source) >= 0.4
219 ):
220 252359 108945.0 0.4 1.5 character_approved_count += 1
221 252359 113096.0 0.4 1.5 continue
222
223 17568 8788.0 0.5 0.1 return character_approved_count / len(ordered_characters)

line_profiler test after changes:
Total time: 1.62655 s
File: /charset_normalizer-enhanced/charset_normalizer/md.py
Function: feed at line 141
Line # Hits Time Per Hit % Time Line Contents
==============================================================
141 @profile
142 def feed(self, character: str) -> None:
143 2494830 821098.0 0.3 50.5 if is_unprintable(character):
144 14585 4548.0 0.3 0.3 self._unprintable_count += 1
145 2494830 800900.0 0.3 49.2 self._character_count += 1
Total time: 0.008891 s
File: /charset_normalizer-enhanced/charset_normalizer/utils.py
Function: is_unprintable at line 201
Line # Hits Time Per Hit % Time Line Contents
==============================================================
201 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
202 @profile
203 def is_unprintable(character: str) -> bool:
204 14995 3686.0 0.2 41.5 if (
205 7354 2635.0 0.4 29.6 character.isspace() is False # includes \n \t \r \v
206 7338 2419.0 0.3 27.2 and character.isprintable() is False
207 303 84.0 0.3 0.9 and character != "\x1A" # Why? Its the ASCII substitute character.
208 ):
209 302 67.0 0.2 0.8 return True
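
For reference, the cached helper visible in the dump above boils down to the following sketch, reconstructed from the profile output. The real UTF8_MAXIMAL_ALLOCATION value is an assumption here, and the simplified body returns the boolean expression directly rather than an explicit True/None:

from functools import lru_cache

UTF8_MAXIMAL_ALLOCATION = 1_114_112  # assumption: stand-in for the upstream constant

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    # Memoized: the three checks run once per distinct character rather than
    # once per occurrence, which is what collapses feed()'s 3.58 s to the
    # figures shown in the dump above.
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # the ASCII substitute character
    )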
Total time: 3.49171 s
File: /charset_normalizer-enhanced/charset_normalizer/cd.py
Function: characters_popularity_compare at line 165
Line # Hits Time Per Hit % Time Line Contents
==============================================================
165 @profile
166 def characters_popularity_compare(
167 language: str, ordered_characters: List[str]
168 ) -> float:
169 """
170 Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
171 The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
172 Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
173 """
174 17568 7717.0 0.4 0.2 if language not in FREQUENCIES:
175 raise ValueError("{} not available".format(language))
176
177 17568 5726.0 0.3 0.2 character_approved_count = 0 # type: int
178 17568 17256.0 1.0 0.5 FREQUENCIES_language_set = set(FREQUENCIES[language])
179
180 531331 158858.0 0.3 4.5 for character in ordered_characters:
181 513763 169012.0 0.3 4.8 if character not in FREQUENCIES_language_set:
182 241494 70204.0 0.3 2.0 continue
183
184 544538 193489.0 0.4 5.5 characters_before_source = FREQUENCIES[language][
185 272269 155254.0 0.6 4.4 0 : FREQUENCIES[language].index(character)
186 ] # type: List[str]
187 544538 190095.0 0.3 5.4 characters_after_source = FREQUENCIES[language][
188 272269 150943.0 0.6 4.3 FREQUENCIES[language].index(character) :
189 ] # type: List[str]
190 544538 180382.0 0.3 5.2 characters_before = ordered_characters[
191 272269 144907.0 0.5 4.2 0 : ordered_characters.index(character)
192 ] # type: List[str]
193 544538 184880.0 0.3 5.3 characters_after = ordered_characters[
194 272269 142245.0 0.5 4.1 ordered_characters.index(character) :
195 ] # type: List[str]
196
197 544538 187248.0 0.3 5.4 before_match_count = len(
198 272269 317993.0 1.2 9.1 set(characters_before) & set(characters_before_source)
199 ) # type: int
200
201 544538 192450.0 0.4 5.5 after_match_count = len(
202 272269 429966.0 1.6 12.3 set(characters_after) & set(characters_after_source)
203 ) # type: int
204
205 272269 104768.0 0.4 3.0 if len(characters_before_source) == 0 and before_match_count <= 4:
206 12369 4154.0 0.3 0.1 character_approved_count += 1
207 12369 4140.0 0.3 0.1 continue
208
209 259900 93895.0 0.4 2.7 if len(characters_after_source) == 0 and after_match_count <= 4:
210 character_approved_count += 1
211 continue
212
213 306500 89322.0 0.3 2.6 if (
214 259900 103875.0 0.4 3.0 before_match_count / len(characters_before_source) >= 0.4
215 46600 17576.0 0.4 0.5 or after_match_count / len(characters_after_source) >= 0.4
216 ):
217 252359 82925.0 0.3 2.4 character_approved_count += 1
218 252359 85573.0 0.3 2.5 continue
219
220 17568 6854.0 0.4 0.2 return character_approved_count / len(ordered_characters)
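
The two cd.py changes in the dump, extracted into a self-contained sketch. The rankings below are hypothetical stand-ins for FREQUENCIES[language] and ordered_characters, not real frequency data:

ranking_a = list("etaoinshrdlu")  # stands in for FREQUENCIES[language]
ranking_b = list("eatonishrdul")  # stands in for ordered_characters

# 1) Build the set once per call; `character not in FREQUENCIES[language]`
#    was an O(n) list scan on every loop iteration.
ranking_a_set = set(ranking_a)

for character in ranking_b:
    if character not in ranking_a_set:
        continue
    cut_a = ranking_a.index(character)
    cut_b = ranking_b.index(character)
    # 2) Old: [e in before for e in before_source].count(True) re-scans one
    #    slice for every element of the other (quadratic per pivot character).
    #    New: hash each slice once and intersect, as in the profiled code.
    before_match_count = len(set(ranking_b[:cut_b]) & set(ranking_a[:cut_a]))
    after_match_count = len(set(ranking_b[cut_b:]) & set(ranking_a[cut_a:]))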
I have put some thoughts in the originally opened issue, plus some minor remarks here.
Codecov Report
@@ Coverage Diff @@
## master #183 +/- ##
==========================================
+ Coverage 89.74% 89.79% +0.05%
==========================================
Files 11 11
Lines 1199 1205 +6
==========================================
+ Hits 1076 1082 +6
Misses 123 123
Continue to review full report at Codecov.
Ok
LGTM
+1, small correction:
$ python3 -m timeit '"".join(["Ab", "T"])'
5000000 loops, best of 5: 73.1 nsec per loop
$ python3 -m timeit '"Ab" + "T"'
50000000 loops, best of 5: 6.28 nsec per loop
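
The same comparison as a runnable script. One caveat worth noting: a literal expression like "Ab" + "T" is constant-folded by CPython at compile time, so the shell one-liner above mostly measures loading a constant. Binding the operands to names forces the concatenation to happen at runtime; `+` still wins for exactly two short strings. Numbers vary by CPython version and hardware:

import timeit

# "".join() pays for building a list plus a method call; plain `+` is the
# cheaper path for two short operands.
setup = 'a = "Ab"; b = "T"'
print(timeit.timeit('"".join([a, b])', setup=setup, number=5_000_000))
print(timeit.timeit('a + b', setup=setup, number=5_000_000))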
Total time: 11.4444 s
File: /charset_normalizer/charset_normalizer/md.py
Function: feed at line 266
Line # Hits Time Per Hit % Time Line Contents
==============================================================
266 @profile
267 def feed(self, character: str) -> None:
268 2494830 1233959.0 0.5 10.8 if character.isalpha():
269 #self._buffer = self._buffer + character
270 1784406 1087571.0 0.6 9.5 self._buffer = "".join((self._buffer, character))
271 1784406 866044.0 0.5 7.6 if is_accentuated(character):
272 58990 30211.0 0.5 0.3 self._buffer_accent_count += 1
273 5236026 2043855.0 0.4 17.9 if (
274 1784406 738504.0 0.4 6.5 self._foreign_long_watch is False
275 2849962 1328547.0 0.5 11.6 and (is_latin(character) is False or is_accentuated(character))
276 163719 86871.0 0.5 0.8 and is_cjk(character) is False
277 128145 65339.0 0.5 0.6 and is_hangul(character) is False
278 110994 55179.0 0.5 0.5 and is_katakana(character) is False
279 106144 52983.0 0.5 0.5 and is_hiragana(character) is False
280 92656 45662.0 0.5 0.4 and is_thai(character) is False
281 ):
282 85725 37543.0 0.4 0.3 self._foreign_long_watch = True
283 1784406 698397.0 0.4 6.1 return
284 710424 315180.0 0.4 2.8 if not self._buffer:
285 307642 119599.0 0.4 1.0 return
286 1019091 400081.0 0.4 3.5 if (
287 667451 315987.0 0.5 2.8 character.isspace() or is_punctuation(character) or is_separator(character)
288 351640 144197.0 0.4 1.3 ) and self._buffer:
289 351640 178431.0 0.5 1.6 self._word_count += 1
290 351640 165405.0 0.5 1.4 buffer_length = len(self._buffer) # type: int
291
292 351640 169634.0 0.5 1.5 self._character_count += buffer_length
293
294 351640 148625.0 0.4 1.3 if buffer_length >= 4:
295 217909 107408.0 0.5 0.9 if self._buffer_accent_count / buffer_length > 0.34:
296 4206 1867.0 0.4 0.0 self._is_current_word_bad = True
297 # Word/Buffer ending with a upper case accentuated letter are so rare,
298 # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
299 217909 124633.0 0.6 1.1 if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
300 1716 859.0 0.5 0.0 self._foreign_long_count += 1
301 1716 766.0 0.4 0.0 self._is_current_word_bad = True
302 351640 147325.0 0.4 1.3 if buffer_length >= 24 and self._foreign_long_watch:
303 183 85.0 0.5 0.0 self._foreign_long_count += 1
304 183 75.0 0.4 0.0 self._is_current_word_bad = True
305
306 351640 144729.0 0.4 1.3 if self._is_current_word_bad:
307 11671 5501.0 0.5 0.0 self._bad_word_count += 1
308 11671 6307.0 0.5 0.1 self._bad_character_count += len(self._buffer)
309 11671 4893.0 0.4 0.0 self._is_current_word_bad = False
310
311 351640 152133.0 0.4 1.3 self._foreign_long_watch = False
312 351640 151359.0 0.4 1.3 self._buffer = ""
313 351640 149146.0 0.4 1.3 self._buffer_accent_count = 0
314 121376 47162.0 0.4 0.4 elif (
315 51142 21640.0 0.4 0.2 character not in {"<", ">", "-", "=", "~", "|", "_"}
316 41640 20531.0 0.5 0.2 and character.isdigit() is False
317 28594 13343.0 0.5 0.1 and is_symbol(character)
318 ):
319 16366 7102.0 0.4 0.1 self._is_current_word_bad = True
320 16366 9757.0 0.6 0.1 self._buffer += character

Total time: 10.8087 s
File: /charset_normalizer-pullrequest/charset_normalizer/md.py
Function: feed at line 266
Line # Hits Time Per Hit % Time Line Contents
==============================================================
266 @profile
267 def feed(self, character: str) -> None:
268 2494830 1190294.0 0.5 11.0 if character.isalpha():
269 1784406 929460.0 0.5 8.6 self._buffer += character
270 #self._buffer = "".join([self._buffer, character])
271 1784406 827987.0 0.5 7.7 if is_accentuated(character):
272 58990 28353.0 0.5 0.3 self._buffer_accent_count += 1
273 5236026 1927087.0 0.4 17.8 if (
274 1784406 714165.0 0.4 6.6 self._foreign_long_watch is False
275 2849962 1273217.0 0.4 11.8 and (is_latin(character) is False or is_accentuated(character))
276 163719 84297.0 0.5 0.8 and is_cjk(character) is False
277 128145 64199.0 0.5 0.6 and is_hangul(character) is False
278 110994 53247.0 0.5 0.5 and is_katakana(character) is False
279 106144 50917.0 0.5 0.5 and is_hiragana(character) is False
280 92656 44131.0 0.5 0.4 and is_thai(character) is False
281 ):
282 85725 35084.0 0.4 0.3 self._foreign_long_watch = True
283 1784406 645751.0 0.4 6.0 return
284 710424 296269.0 0.4 2.7 if not self._buffer:
285 307642 112609.0 0.4 1.0 return
286 1019091 377127.0 0.4 3.5 if (
287 667451 303871.0 0.5 2.8 character.isspace() or is_punctuation(character) or is_separator(character)
288 351640 136873.0 0.4 1.3 ) and self._buffer:
289 351640 169420.0 0.5 1.6 self._word_count += 1
290 351640 157195.0 0.4 1.5 buffer_length = len(self._buffer) # type: int
291
292 351640 166131.0 0.5 1.5 self._character_count += buffer_length
293
294 351640 142946.0 0.4 1.3 if buffer_length >= 4:
295 217909 105082.0 0.5 1.0 if self._buffer_accent_count / buffer_length > 0.34:
296 4206 1751.0 0.4 0.0 self._is_current_word_bad = True
297 # Word/Buffer ending with a upper case accentuated letter are so rare,
298 # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
299 217909 121442.0 0.6 1.1 if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
300 1716 851.0 0.5 0.0 self._foreign_long_count += 1
301 1716 731.0 0.4 0.0 self._is_current_word_bad = True
302 351640 140858.0 0.4 1.3 if buffer_length >= 24 and self._foreign_long_watch:
303 183 100.0 0.5 0.0 self._foreign_long_count += 1
304 183 88.0 0.5 0.0 self._is_current_word_bad = True
305
306 351640 138198.0 0.4 1.3 if self._is_current_word_bad:
307 11671 5325.0 0.5 0.0 self._bad_word_count += 1
308 11671 5951.0 0.5 0.1 self._bad_character_count += len(self._buffer)
309 11671 4690.0 0.4 0.0 self._is_current_word_bad = False
310
311 351640 147311.0 0.4 1.4 self._foreign_long_watch = False
312 351640 142377.0 0.4 1.3 self._buffer = ""
313 351640 148818.0 0.4 1.4 self._buffer_accent_count = 0
314 121376 45042.0 0.4 0.4 elif (
315 51142 20687.0 0.4 0.2 character not in {"<", ">", "-", "=", "~", "|", "_"}
316 41640 19716.0 0.5 0.2 and character.isdigit() is False
317 28594 13002.0 0.5 0.1 and is_symbol(character)
318 ):
319 16366 6764.0 0.4 0.1 self._is_current_word_bad = True
320 16366 9258.0 0.6 0.1 self._buffer += character
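
Why `+=` in a loop is acceptable here, despite the usual advice to prefer join for repeated appends: on CPython, `s += t` can resize `s` in place when it is the only reference, so growing feed()'s buffer one character at a time stays close to linear. This is an implementation detail of CPython, not a language guarantee; a minimal timing sketch:

import timeit

# Append one character at a time, the way feed() grows its buffer.
elapsed = timeit.timeit(
    's = ""\nfor _ in range(10_000):\n    s += "x"',
    number=100,
)
print(f"{elapsed:.3f} s for 100 rounds of 10k single-character appends")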
I'm done.
Everything looks good. Let's proceed. A word of caution about the delays reported by GHA: we never know what VM is behind a run or how it is scaled.
Performance comparison: #181 / performance test summary
pprofile tests
test.py
Before
Time: 838.97 s.
cachegrind.out.original.zip
Merged
Time: 716.45 s.
cachegrind.out.commits.zip