forked from faraday/wikiprep-esa
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaddAnchors.py
103 lines (73 loc) · 2.9 KB
/
addAnchors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/python
'''
Copyright (C) 2010 Cagatay Calli <[email protected]>
Adds anchors from Wikiprep output to target Wikipedia articles.
Legacy input format: <Target page id> <Source page id> <Anchor text (up to the end of the line)>
Input format: <Target page id> <Source page id> <Anchor location within text> <Anchor text (up to the end of the line)>
Output format: <Target page id> <Anchor text>
USAGE: addAnchors.py <anchor file from Wikiprep> <any writeable folder>
The folder is used by the script to create data files that are loaded into the database.
IMPORTANT: If you use XML output from a recent version of Wikiprep
(e.g. Zemanta fork), then set FORMAT to 'Zemanta-legacy' or 'Zemanta-modern'.
'''
import sys
import MySQLdb
# Maximum number of anchor rows written to one partition file before the
# script rolls over to the next file.
PARTITION_SIZE = 100000

# Layout of the Wikiprep anchor file being read.
# formats: 1) Gabrilovich 2) Zemanta-legacy 3) Zemanta-modern
FORMAT = 'Gabrilovich'

# Zero-based index of the tab-separated column that holds the anchor text.
# Three-column input keeps the anchor text at index 2; the Zemanta layouts
# carry one extra column before it, which moves the anchor text to index 3.
_ANCHOR_COLUMN_BY_FORMAT = {
    'Gabrilovich': 2,
    'Zemanta-legacy': 3,
    'Zemanta-modern': 3,
}

# Fail loudly on a misspelled FORMAT instead of silently reading column 3.
if FORMAT not in _ANCHOR_COLUMN_BY_FORMAT:
    raise ValueError(
        "Unknown FORMAT %r; expected one of: %s"
        % (FORMAT, ', '.join(sorted(_ANCHOR_COLUMN_BY_FORMAT))))

FIELD_POS = _ANCHOR_COLUMN_BY_FORMAT[FORMAT]
# Command line: <anchor file from Wikiprep> <any writeable folder>
args = sys.argv[1:]

if len(args) < 2:
    # Explain the failure instead of exiting silently with status 1.
    sys.stderr.write(
        "USAGE: addAnchors.py <anchor file from Wikiprep> "
        "<any writeable folder>\n")
    sys.exit(1)

# Anchor file to read (first argument).
f = open(args[0], 'r')

# Normalise the output folder (second argument) to exactly one trailing slash.
outFolder = args[1].rstrip('/') + '/'

# Partition files are named <outFolder>zanchor0, <outFolder>zanchor1, ...
# outFolder already ends with '/', so no extra separator is added here.
outPrefix = outFolder + 'zanchor'
out = open(outPrefix + '0', 'w')
# Discard the first three lines of the anchor file unread
# (presumably the Wikiprep file header -- confirm against a real dump).
for _ in range(3):
    f.readline()

lc = 0    # rows written to the partition file that is currently open
outk = 0  # index of the partition file that is currently open

try:
    # Iterate the file object directly; file.xreadlines() is deprecated.
    for line in f:
        fields = line.split('\t')
        anc = fields[FIELD_POS].rstrip('\n')
        # Output row: <target page id> TAB <anchor text>
        out.write(fields[0] + '\t' + anc + '\n')
        lc += 1
        if lc >= PARTITION_SIZE:
            # Current partition is full: roll over to the next file.
            lc = 0
            outk += 1
            out.close()
            out = open(outPrefix + str(outk), 'w')
finally:
    # Close both files exactly once, even if a malformed row raised.
    out.close()
    f.close()

# Turn outk from "index of the last opened file" into "number of partition
# files that contain rows". A trailing file that received no rows (input
# length an exact multiple of PARTITION_SIZE) is deliberately not counted.
if lc > 0:
    outk += 1
# Open the connection used to load and merge the partition files.
# NOTE(review): host, user, password and schema are hard-coded here; consider
# moving them to configuration before using this outside a local setup.
try:
    conn = MySQLdb.connect(
        host='localhost',
        user='root',
        passwd='123456',
        db='wiki',
        charset="utf8",
        use_unicode=True,
        # The script issues LOAD DATA LOCAL INFILE, which the client library
        # may refuse unless local-infile support is requested explicitly.
        local_infile=1,
    )
except MySQLdb.Error as e:
    print("Error %d: %s" % (e.args[0], e.args[1]))
    sys.exit(1)
# For every partition file: bulk-load it into a staging table, collapse the
# rows into one concatenated anchor string per target page, and prepend that
# string to the matching row of the `text` table.
try:
    cursor = conn.cursor()

    for i in range(outk):
        si = str(i)
        staging = "zanchor" + si     # raw (target_id, anchor) rows
        grouped = "anchorList" + si  # one concatenated row per target_id

        cursor.execute("DROP TABLE IF EXISTS " + staging)
        cursor.execute("CREATE TABLE " + staging + " (target_id int(10) unsigned, anchor blob)")
        # Pass the path as a parameter so the driver quotes and escapes it;
        # a folder name containing a quote or backslash would otherwise
        # break the statement.
        cursor.execute("LOAD DATA LOCAL INFILE %s INTO TABLE " + staging, (outPrefix + si,))
        cursor.execute("CREATE INDEX idx_target_id ON " + staging + " (target_id);")

        cursor.execute("DROP TABLE IF EXISTS " + grouped)
        cursor.execute("CREATE TABLE " + grouped + " (target_id int(10) unsigned, anchor_text mediumblob)")
        # NOTE(review): GROUP_CONCAT output is capped by the server variable
        # group_concat_max_len (1024 bytes by default), so long anchor lists
        # may be truncated -- confirm whether that is intended.
        cursor.execute("INSERT " + grouped + " SELECT a.target_id,GROUP_CONCAT(a.anchor SEPARATOR ' \n ') AS anchor_text FROM " + staging + " a WHERE a.anchor IS NOT NULL GROUP BY a.target_id")
        cursor.execute("DROP TABLE " + staging)

        # add anchors after creating each partition
        cursor.execute("CREATE INDEX idx_target_id ON " + grouped + " (target_id);")
        cursor.execute("UPDATE text t, " + grouped + " a SET t.old_text = CONCAT(a.anchor_text,' \n',t.old_text) WHERE t.old_id = a.target_id AND a.anchor_text IS NOT NULL;")
        cursor.execute("DROP TABLE " + grouped)

    cursor.close()
    conn.close()
except MySQLdb.Error as e:
    print("Error %d: %s" % (e.args[0], e.args[1]))
    sys.exit(1)