From fbcc082767fb3f5711d7d64787b353bc0faa3117 Mon Sep 17 00:00:00 2001 From: Dave Andersen Date: Sat, 23 Sep 2023 20:45:48 -0400 Subject: [PATCH 01/10] Use heap-based top-k for python to bring into parity with Rust --- python/related.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/related.py b/python/related.py index 287ee6ef..f6b75f0b 100644 --- a/python/related.py +++ b/python/related.py @@ -1,3 +1,4 @@ +import heapq import json from typing import Dict @@ -51,12 +52,10 @@ def main(): related_posts_dict[related_post] = 0 related_posts_dict[related_post] += 1 - sorted_posts = sorted( - related_posts_dict.items(), key=lambda x: x[1], reverse=True - ) - - num = min(5, len(sorted_posts)) - top_posts = [p[0].to_dict() for p in sorted_posts[:num]] + top_posts = [ + p.to_dict() + for p in heapq.nlargest(5, related_posts_dict, key=related_posts_dict.get) + ] all_related_posts.append( { From 5b595316add5530ebc9cfb030ccc486176bac33a Mon Sep 17 00:00:00 2001 From: Dave Andersen Date: Sat, 23 Sep 2023 21:07:34 -0400 Subject: [PATCH 02/10] Switch to index based dictionary instead --- python/related.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/python/related.py b/python/related.py index f6b75f0b..66e93e42 100644 --- a/python/related.py +++ b/python/related.py @@ -36,26 +36,35 @@ def main(): posts = json.load(f) tag_map = {} - for post in posts: + for idx, post in enumerate(posts): for tag in post["tags"]: if tag not in tag_map: tag_map[tag] = [] - tag_map[tag].append(Post(post["_id"], post["title"], post["tags"])) + tag_map[tag].append(idx) all_related_posts = [] - for post in posts: + for this_post_idx, post in enumerate(posts): related_posts_dict: Dict[Post, int] = {} for tag in post["tags"]: for related_post in tag_map[tag]: - if related_post._id != post["_id"]: + if related_post != this_post_idx: if related_post not in related_posts_dict: related_posts_dict[related_post] = 0 related_posts_dict[related_post] += 1 - top_posts = [ - p.to_dict() - for p in heapq.nlargest(5, related_posts_dict, key=related_posts_dict.get) - ] + top_posts_idx = heapq.nlargest( + 5, related_posts_dict, key=related_posts_dict.get + ) + top_posts = [] + for idx in top_posts_idx: + tp = posts[idx] + top_posts.append( + { + "_id": tp["_id"], + "title": tp["title"], + "tags": tp["tags"], + } + ) all_related_posts.append( { From 8fcf8ecda4f7f4ad16b157574520e27557360b65 Mon Sep 17 00:00:00 2001 From: Dave Andersen Date: Sat, 23 Sep 2023 21:12:42 -0400 Subject: [PATCH 03/10] Revert "Switch to index based dictionary instead" This reverts commit 5b595316add5530ebc9cfb030ccc486176bac33a. --- python/related.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/python/related.py b/python/related.py index 66e93e42..f6b75f0b 100644 --- a/python/related.py +++ b/python/related.py @@ -36,35 +36,26 @@ def main(): posts = json.load(f) tag_map = {} - for idx, post in enumerate(posts): + for post in posts: for tag in post["tags"]: if tag not in tag_map: tag_map[tag] = [] - tag_map[tag].append(idx) + tag_map[tag].append(Post(post["_id"], post["title"], post["tags"])) all_related_posts = [] - for this_post_idx, post in enumerate(posts): + for post in posts: related_posts_dict: Dict[Post, int] = {} for tag in post["tags"]: for related_post in tag_map[tag]: - if related_post != this_post_idx: + if related_post._id != post["_id"]: if related_post not in related_posts_dict: related_posts_dict[related_post] = 0 related_posts_dict[related_post] += 1 - top_posts_idx = heapq.nlargest( - 5, related_posts_dict, key=related_posts_dict.get - ) - top_posts = [] - for idx in top_posts_idx: - tp = posts[idx] - top_posts.append( - { - "_id": tp["_id"], - "title": tp["title"], - "tags": tp["tags"], - } - ) + top_posts = [ + p.to_dict() + for p in heapq.nlargest(5, related_posts_dict, key=related_posts_dict.get) + ] all_related_posts.append( { From d0f9882a6220ff9a4ee94459b81c2ed9772a86bb Mon Sep 17 00:00:00 2001 From: Dave Andersen Date: Sat, 23 Sep 2023 21:20:39 -0400 Subject: [PATCH 04/10] Use indexing instead of storing full dict --- python/related.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/related.py b/python/related.py index f6b75f0b..2462f76f 100644 --- a/python/related.py +++ b/python/related.py @@ -36,24 +36,24 @@ def main(): posts = json.load(f) tag_map = {} - for post in posts: + for idx, post in enumerate(posts): for tag in post["tags"]: if tag not in tag_map: tag_map[tag] = [] - tag_map[tag].append(Post(post["_id"], post["title"], post["tags"])) + tag_map[tag].append(idx) all_related_posts = [] - for post in posts: + for this_post_idx, post in enumerate(posts): related_posts_dict: Dict[Post, int] = {} for tag in post["tags"]: for related_post in tag_map[tag]: - if related_post._id != post["_id"]: + if related_post != this_post_idx: if related_post not in related_posts_dict: related_posts_dict[related_post] = 0 related_posts_dict[related_post] += 1 top_posts = [ - p.to_dict() + {k: posts[p][k] for k in ("_id", "title", "tags")} for p in heapq.nlargest(5, related_posts_dict, key=related_posts_dict.get) ] From 2477da68e425c57fb263ae4ca731329eae0b6992 Mon Sep 17 00:00:00 2001 From: Dave Andersen Date: Sat, 23 Sep 2023 21:46:04 -0400 Subject: [PATCH 05/10] fix type signature of updated related_posts_dict --- python/related.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/related.py b/python/related.py index 2462f76f..ca2278ab 100644 --- a/python/related.py +++ b/python/related.py @@ -44,7 +44,7 @@ def main(): all_related_posts = [] for this_post_idx, post in enumerate(posts): - related_posts_dict: Dict[Post, int] = {} + related_posts_dict: Dict[int, int] = {} for tag in post["tags"]: for related_post in tag_map[tag]: if related_post != this_post_idx: From cf4d86fbb8e4d9d2cdb61af252e748e2bc75798a Mon Sep 17 00:00:00 2001 From: Dave Andersen Date: Sat, 23 Sep 2023 21:55:40 -0400 Subject: [PATCH 06/10] use same array-based approach instead of dict --- python/related.py | 10 +++++----- run.sh | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/related.py b/python/related.py index ca2278ab..16f4da53 100644 --- a/python/related.py +++ b/python/related.py @@ -44,17 +44,17 @@ def main(): all_related_posts = [] for this_post_idx, post in enumerate(posts): - related_posts_dict: Dict[int, int] = {} + related_posts_list = [0] * len(posts) for tag in post["tags"]: for related_post in tag_map[tag]: if related_post != this_post_idx: - if related_post not in related_posts_dict: - related_posts_dict[related_post] = 0 - related_posts_dict[related_post] += 1 + related_posts_list[related_post] += 1 top_posts = [ {k: posts[p][k] for k in ("_id", "title", "tags")} - for p in heapq.nlargest(5, related_posts_dict, key=related_posts_dict.get) + for p in heapq.nlargest( + 5, range(len(posts)), key=lambda x: related_posts_list[x] + ) ] all_related_posts.append( diff --git a/run.sh b/run.sh index 0e5a7eb3..f0cbdec0 100755 --- a/run.sh +++ b/run.sh @@ -16,7 +16,7 @@ run_go() { if [ $HYPER == 1 ]; then command hyperfine -r 10 --show-output "./related" else - command time -f '%es %Mk' ./related + command /usr/bin/time -f '%es %Mk' ./related fi } @@ -25,7 +25,7 @@ run_rust() { cd ./rust && cargo build --release && if [ $HYPER == 1 ]; then - command hyperfine -r 10 --show-output "./target/release/rust" + command hyperfine -r 40 --show-output "./target/release/rust" else command time -f '%es %Mk' ./target/release/rust fi From 43b0eef2351152e6ff53bb348f5083a994247266 Mon Sep 17 00:00:00 2001 From: Dave Andersen Date: Sat, 23 Sep 2023 22:07:04 -0400 Subject: [PATCH 07/10] Use collections.Counter for counting a bit faster --- python/related.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/python/related.py b/python/related.py index 16f4da53..c4ff95a9 100644 --- a/python/related.py +++ b/python/related.py @@ -1,5 +1,6 @@ import heapq import json +from collections import Counter from typing import Dict @@ -44,16 +45,13 @@ def main(): all_related_posts = [] for this_post_idx, post in enumerate(posts): - related_posts_list = [0] * len(posts) - for tag in post["tags"]: - for related_post in tag_map[tag]: - if related_post != this_post_idx: - related_posts_list[related_post] += 1 + related_posts_list2 = Counter((p for tag in post["tags"] for p in tag_map[tag])) + related_posts_list2[this_post_idx] = 0 top_posts = [ {k: posts[p][k] for k in ("_id", "title", "tags")} for p in heapq.nlargest( - 5, range(len(posts)), key=lambda x: related_posts_list[x] + 5, related_posts_list2, key=lambda x: related_posts_list2[x] ) ] From b099760c69b7972e42171621a66bf12d9c4d47f3 Mon Sep 17 00:00:00 2001 From: Dave Andersen Date: Sat, 23 Sep 2023 22:12:50 -0400 Subject: [PATCH 08/10] Clean up - remove unused import, fix list title --- python/related.py | 35 +++-------------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/python/related.py b/python/related.py index c4ff95a9..a4b653ed 100644 --- a/python/related.py +++ b/python/related.py @@ -1,35 +1,6 @@ import heapq import json from collections import Counter -from typing import Dict - - -class Post: - def __init__(self, _id, title, tags): - self._id = _id - self.title = title - self.tags = tags - - def __hash__(self): - return hash(self._id) - - def __eq__(self, other): - if isinstance(other, Post): - return self._id == other._id - return False - - def to_dict(self): - return { - "_id": self._id, - "title": self.title, - "tags": self.tags, - } - - -class PostWithSharedTags: - def __init__(self, post, shared_tags): - self.post = post - self.shared_tags = shared_tags def main(): @@ -45,13 +16,13 @@ def main(): all_related_posts = [] for this_post_idx, post in enumerate(posts): - related_posts_list2 = Counter((p for tag in post["tags"] for p in tag_map[tag])) - related_posts_list2[this_post_idx] = 0 + related_posts_list = Counter((p for tag in post["tags"] for p in tag_map[tag])) + related_posts_list[this_post_idx] = 0 top_posts = [ {k: posts[p][k] for k in ("_id", "title", "tags")} for p in heapq.nlargest( - 5, related_posts_list2, key=lambda x: related_posts_list2[x] + 5, related_posts_list, key=lambda x: related_posts_list[x] ) ] From eb6f68b95955b5dbd6cb4abd1d2513df8807f6cd Mon Sep 17 00:00:00 2001 From: Dave Andersen Date: Sat, 23 Sep 2023 22:16:41 -0400 Subject: [PATCH 09/10] optimize dictionary access by heapq (1.7 -> 1.4s) --- python/related.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/related.py b/python/related.py index a4b653ed..3217550b 100644 --- a/python/related.py +++ b/python/related.py @@ -21,9 +21,7 @@ def main(): top_posts = [ {k: posts[p][k] for k in ("_id", "title", "tags")} - for p in heapq.nlargest( - 5, related_posts_list, key=lambda x: related_posts_list[x] - ) + for p in heapq.nlargest(5, related_posts_list, key=related_posts_list.get) ] all_related_posts.append( From b1d776a4bbf6281088edd1fd0b32f2990c148251 Mon Sep 17 00:00:00 2001 From: jinyus <30532952+jinyus@users.noreply.github.com> Date: Sat, 23 Sep 2023 22:57:36 -0500 Subject: [PATCH 10/10] Update run.sh --- run.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/run.sh b/run.sh index f0cbdec0..83628bbb 100755 --- a/run.sh +++ b/run.sh @@ -12,11 +12,10 @@ run_go() { echo "Running Go" && cd ./go && go build && - #command time -f '%es %Mk' ./related if [ $HYPER == 1 ]; then command hyperfine -r 10 --show-output "./related" else - command /usr/bin/time -f '%es %Mk' ./related + command time -f '%es %Mk' ./related fi } @@ -25,7 +24,7 @@ run_rust() { cd ./rust && cargo build --release && if [ $HYPER == 1 ]; then - command hyperfine -r 40 --show-output "./target/release/rust" + command hyperfine -r 20 --show-output "./target/release/rust" else command time -f '%es %Mk' ./target/release/rust fi @@ -46,7 +45,7 @@ run_python() { echo "Running Python" && cd ./python && if [ $HYPER == 1 ]; then - command hyperfine -r 1 "python3 ./related.py" + command hyperfine -r 2 --show-output "python3 ./related.py" else command time -f '%es %Mk' python3 ./related.py fi