-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_empty_journals.py
86 lines (66 loc) · 2.11 KB
/
find_empty_journals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Find which journals are missing articles
import settings, requests, json
def find_empty_journals():
    """Return the ISSNs of UK journals that have no matching article ISSN.

    Collects the set of ISSNs from the ``journal`` index type and from the
    ``article`` index type (both restricted to records whose
    ``index.country`` matches "United Kingdom"), prints a summary line for
    each plus the size of the difference, and returns the journal ISSNs
    that do not appear among the article ISSNs.

    Returns:
        list: ISSN strings present for journals but absent for articles.
        Order is unspecified because it is derived from a set.
    """
    journal_issns = _collect_issns("journal")
    article_issns = _collect_issns("article")

    missing = journal_issns.difference(article_issns)
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print("Journals without articles: {0}".format(len(missing)))
    return list(missing)


def _build_issn_facet_query(doc_type):
    """Build the search body that facets UK records of ``doc_type`` by ISSN."""
    return {
        "query": {
            "bool": {
                "must": [
                    {"term": {"_type": doc_type}},
                    {"match": {"index.country": "United Kingdom"}}
                ]
            }
        },
        "facets": {
            "issns": {
                "terms": {
                    "field": "index.issn.exact"
                }
            }
        }
    }


def _collect_issns(doc_type):
    """Fetch the unique ISSNs for ``doc_type`` and print a summary line."""
    search_url = settings.ES_INDEX + "/" + doc_type + "/_search"
    query = _build_issn_facet_query(doc_type)

    # Two requests on purpose: the first only reads the facet total, which is
    # then used as the facet's "size" so the second request asks for that many
    # terms instead of relying on the server's default facet size.
    query['facets']['issns']['terms']['size'] = get_count(search_url, query)
    issns, total = query_for_issns(search_url, query)

    print("Number of {0} ISSNs: {1}\t Unique: {2}".format(doc_type, total, len(issns)))
    return issns
def query_for_issns(url, query):
    """Run ``query`` against ``url`` and gather the terms of its ``issns`` facet.

    Args:
        url: Search endpoint to send the request to.
        query: Dict request body; it is JSON-encoded and sent as the body of
            a GET request. The response must contain ``facets.issns``.

    Returns:
        tuple: ``(terms, total)`` where ``terms`` is the set of ``term``
        values from the facet and ``total`` is the facet's reported
        ``total``. ``terms`` is empty when ``total`` is not positive.
    """
    response = requests.get(url, data=json.dumps(query))
    facet = response.json()['facets']['issns']
    total = facet['total']

    # The term list is only read when the facet reports at least one value.
    if not total > 0:
        return (set(), total)

    return (set(entry['term'] for entry in facet['terms']), total)
def get_count(url, query):
    """Return the ``total`` reported by the ``issns`` facet for ``query``.

    Args:
        url: Search endpoint to send the request to.
        query: Dict request body; it is JSON-encoded and sent as the body of
            a GET request. The response must contain ``facets.issns``.

    Returns:
        The value of ``facets.issns.total`` from the JSON response.
    """
    payload = json.dumps(query)
    body = requests.get(url, data=payload).json()
    return body['facets']['issns']['total']
# Script entry point: run the comparison only when this file is executed
# directly. The list returned by find_empty_journals() is not used here; the
# function prints its own summary lines.
if __name__ == '__main__':
    find_empty_journals()