-
Notifications
You must be signed in to change notification settings - Fork 78
/
Copy pathimage_duplicates_demo.py
59 lines (48 loc) · 1.43 KB
/
image_duplicates_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
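@description: Demo that finds near-duplicate images by embedding them with CLIP and mining the most similar pairs.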
"""
import sys
from PIL import Image
sys.path.append('..')
from similarities import ClipSimilarity, paraphrase_mining_embeddings
def load_data(file_path):
    """Read image paths from the first column of a CSV file.

    The first record of the file is treated as a header and skipped. Blank
    rows and rows whose first column is empty are ignored.

    Args:
        file_path: Path to a UTF-8 encoded CSV file.

    Returns:
        A list with the first-column value of every data row, in file order,
        with surrounding whitespace removed.
    """
    # Function-scope import keeps this block self-contained.
    import csv

    data_paths = []
    # newline='' lets the csv module handle line endings itself.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            # The csv reader handles quoted fields, so a path that contains a
            # comma is no longer cut short as it was with str.split(',').
            path = row[0].strip()
            if path:
                data_paths.append(path)
    return data_paths
def main():
    """Print the most similar image pairs among those listed in the demo CSV.

    Reads image paths from ``data/image_info.csv``, drops repeated paths,
    embeds every image with ``ClipSimilarity`` and prints the first ten
    ``(score, idx1, idx2)`` entries returned by
    ``paraphrase_mining_embeddings`` as a score followed by the two paths.
    """
    corpus_paths = load_data('data/image_info.csv')
    # De-duplicate while keeping file order. list(set(...)) ordering depends
    # on string hash randomization, so the printed "top3" paths and the
    # index-to-path mapping differed from run to run.
    corpus_paths = list(dict.fromkeys(corpus_paths))
    print('corpus size:', len(corpus_paths), 'top3:', corpus_paths[:3])
    model = ClipSimilarity()
    print(model)
    # NOTE(review): every image is opened before any embedding is computed;
    # for a very large corpus this may hit the open-file limit - confirm.
    corpus = [Image.open(i) for i in corpus_paths]
    corpus_embeddings = model.get_embeddings(corpus, show_progress_bar=True, convert_to_tensor=True)
    duplicates = paraphrase_mining_embeddings(corpus_embeddings)
    # idx1 / idx2 are positions in corpus_paths. Presumably the pairs come
    # back ordered by descending score (the sample below suggests so).
    for score, idx1, idx2 in duplicates[0:10]:
        print("\nScore: {:.3f}".format(score))
        print(corpus_paths[idx1])
        print(corpus_paths[idx2])

    # Example output:
    #
    # Score: 0.945
    # data/image1.png
    # data/image12-like-image1.png
    #
    # Score: 0.944
    # data/image10.png
    # data/image11-like-image10.png
    #
    # Score: 0.932
    # data/image8-like-image1.png
    # data/image12-like-image1.png
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()