-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathretrieve-sentences.py
executable file
·94 lines (77 loc) · 3.5 KB
/
retrieve-sentences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python
"""
Read one-sentence-per-line from stdin.
Retrieve closest Lucene matches for each sentence, without duplicating retrieved sentences.
(We use a Bloom filter to detect duplicates.)
TODO:
* Detect duplicates in input sentences too?
* Remove this bizarre special case:
querytext = querytext.replace("AND OR", "and or")
querytext = querytext.replace("OR OR", "or or")
querytext = querytext.replace("AND AND", "and and")
querytext = querytext.replace("OR AND", "or and")
http://stackoverflow.com/questions/3452162/lucene-queryparser-interprets-and-or-as-a-command
"""
import sys
import string
import sets
from common.stats import stats
from common.str import percent
DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT = 10
BLOOM_FILTER_SIZE = 1000000000
import numpy
import murmur
import lucene
from lucene import \
SimpleFSDirectory, System, File, \
Document, Field, StandardAnalyzer, IndexSearcher, Version, QueryParser
def retrieve(querytext, searcher, queryparser, maxresults=1000):
    """
    Yield the stored "text" field of each Lucene hit for one query sentence.

    The sentence is treated as plain text rather than as Lucene query
    syntax: the operator keywords OR/AND/NOT are lowercased and the result
    is passed through queryparser.escape() before being parsed.

    Parameters:
        querytext: the sentence to search for.
        searcher: object whose search(query, maxresults) returns hits
            exposing scoreDocs, and whose doc(id) returns a stored document
            (an IndexSearcher in this script).
        queryparser: object providing escape() and parse() (a QueryParser
            in this script).
        maxresults: upper bound on the number of hits requested from the
            searcher.

    Yields:
        doc.get("text") for every hit, in the order of hits.scoreDocs.
        Nothing is yielded when querytext is empty or whitespace-only.
    """
    # A blank sentence has no terms to search for. Bail out before touching
    # the parser: the Lucene 3.x QueryParser rejects an empty query string
    # with a ParseException, which would abort the whole run.
    if not querytext or not querytext.strip():
        return

    # escape() neutralises the special characters but not the keyword
    # operators, so lowercase those by hand. This is a plain substring
    # replacement (it also rewrites e.g. "ORANGE" to "orANGE").
    # See http://stackoverflow.com/questions/3452162/lucene-queryparser-interprets-and-or-as-a-command
    for keyword in ("OR", "AND", "NOT"):
        querytext = querytext.replace(keyword, keyword.lower())

    query = queryparser.parse(queryparser.escape(querytext))
    hits = searcher.search(query, maxresults)
    for hit in hits.scoreDocs:
        # hit.doc is the document id; fetch the stored document to read its text.
        yield searcher.doc(hit.doc).get("text")
if __name__ == "__main__":
    # Script entry point: read one sentence per line from stdin, query the
    # Lucene index for each, and print up to
    # DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT not-yet-seen retrieved
    # sentences per input line to stdout (UTF-8). Progress goes to stderr.

    # One boolean slot per hash bucket. A single hash function indexes it,
    # so two different sentences that collide are treated as duplicates and
    # the later one is dropped.
    # numpy.bool_ is used because the numpy.bool alias is deprecated.
    usedsentences = numpy.zeros((BLOOM_FILTER_SIZE,), dtype=numpy.bool_)
    print >> sys.stderr, "Just created bloom filter with %d entries" % usedsentences.shape[0]
    print >> sys.stderr, stats()

    lucene.initVM()

    # The index is opened from a path relative to the working directory.
    index_path = "lucene.ukwac"
    index_directory = SimpleFSDirectory(File(index_path))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    queryparser = QueryParser(Version.LUCENE_30, "text", analyzer)
    searcher = IndexSearcher(index_directory)

    nonzeros = 0
    try:
        for i, l in enumerate(sys.stdin):
            if i % 100 == 0:
                print >> sys.stderr, "Read %d lines from sys.stdin (bloom filter has %s nonzeros)..." % (i, percent(nonzeros, BLOOM_FILTER_SIZE))
                print >> sys.stderr, stats()

            sentence = l.strip()
            # Nothing to search for on a blank line; skip it instead of
            # sending an empty query to Lucene.
            if not sentence:
                continue

            added_this_sentence = 0
            for newl in retrieve(sentence, searcher, queryparser):
                # Stop once this input sentence has contributed enough new documents.
                if added_this_sentence >= DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT:
                    break

                # Encode once; the same bytes are hashed and written out.
                encoded = newl.strip().encode("utf-8")
                idx = murmur.string_hash(encoded) % BLOOM_FILTER_SIZE

                # Don't emit a sentence whose bucket is already taken.
                if usedsentences[idx]:
                    continue
                usedsentences[idx] = True
                nonzeros += 1
                added_this_sentence += 1

                # The output is already an encoded byte string; write it directly.
                sys.stdout.write(encoded + "\n")
    finally:
        # Release the index reader held by the searcher even if a query fails.
        searcher.close()