# code_release_analysis.py
# (129 lines, 104 loc, 4.22 KB as listed on the hosting page.)
import json
from collections import defaultdict
import argparse
# Affiliations containing any of these substrings (matched against the
# lower-cased affiliation) are dropped before counting.
IGNORE_PART = ['department', 'school']
# Affiliations exactly equal to one of these strings (case-sensitive) are
# dropped before counting.
IGNORE_FULL = ['AI', 'arXiv']
# Tokens skipped in per-word counting; compared against the lower-cased word.
STOPWORDS = [
'the', 'de', ',',
'artificial', 'intelligence', 'automation'
]
def print_ranked_freq(freq, top=10, reverse=True, sort_key=None):
    """Print the leading entries of ``freq`` in ranked order.

    Each entry is printed as ``"<rank>. <key> <value>"`` with ranks starting
    at 1.

    Args:
        freq: Mapping from a name to a sortable value (e.g. a count or a
            tuple).
        top: Maximum number of entries to print. Zero or a negative number
            prints nothing.
        reverse: Sort in descending order when true, ascending otherwise.
        sort_key: Key function applied to each ``(key, value)`` pair.
            Defaults to sorting by the value.
    """
    if sort_key is None:
        sort_key = lambda entry: entry[1]

    ranked = sorted(freq.items(), key=sort_key, reverse=reverse)
    for rank, (name, value) in enumerate(ranked, start=1):
        # Check the limit before printing so that top <= 0 prints nothing.
        if rank > top:
            break
        print(f'{rank}. {name} {value}')
def frequency_per_code_release(data, words=False):
    """Count papers with and without released code per affiliation or word.

    Every paper contributes at most one count per distinct affiliation (or
    per distinct word when ``words`` is true).

    Args:
        data: Iterable of paper records. Each record is indexed with
            ``'code'`` (truthy when code was released) and ``'affiliations'``
            (an iterable of affiliation strings).
        words: When true, count the individual whitespace-separated words of
            the affiliations instead of whole affiliation strings.

    Returns:
        A ``(code_freq, no_code_freq)`` pair of ``defaultdict(int)`` mapping
        each affiliation/word to the number of papers with, respectively
        without, released code.
    """
    code_freq = defaultdict(int)
    no_code_freq = defaultdict(int)
    for item in data:
        freq = code_freq if item['code'] else no_code_freq
        # Drop affiliations that contain an ignored fragment or that are
        # exactly one of the ignored names.
        affiliations = [
            aff for aff in item['affiliations']
            if not any(ig in aff.lower() for ig in IGNORE_PART)
            and aff not in IGNORE_FULL
        ]
        if words:
            # split() without a separator never yields empty tokens, so a
            # paper with no remaining affiliations (or with repeated spaces)
            # does not count the empty string as a word.
            entities = {
                word for word in ' '.join(affiliations).split()
                if word.lower() not in STOPWORDS
            }
        else:
            # A set removes duplicates so each paper counts once per entity.
            entities = set(affiliations)
        for entity in entities:
            freq[entity] += 1
    return code_freq, no_code_freq
def rank_frequency_per_code_release(data, top=10, **kwargs):
    """Print the top entities for papers with code and for papers without.

    Args:
        data: Paper records, forwarded to ``frequency_per_code_release``.
        top: Number of entries requested from each ranking.
        **kwargs: Extra keyword arguments forwarded to
            ``frequency_per_code_release``.
    """
    with_code, without_code = frequency_per_code_release(data, **kwargs)
    sections = (
        ("\nPapers with code\n", with_code),
        ("\nPapers without code\n", without_code),
    )
    for heading, counts in sections:
        print(heading)
        print_ranked_freq(counts, top=top)
def compute_relative_code_release_frequency(code_freq, no_code_freq, min_count=0):
    """Compute the fraction of papers with released code for each entity.

    Entities present in either mapping are considered, so an entity that
    never released code is reported with a fraction of ``0.0`` rather than
    being left out. Neither input mapping is modified.

    Args:
        code_freq: Mapping of entity to number of papers with code.
        no_code_freq: Mapping of entity to number of papers without code.
        min_count: Exclusive lower bound on the total paper count; only
            entities with ``total > min_count`` are kept.

    Returns:
        A ``defaultdict(tuple)`` mapping each kept entity to
        ``(fraction_with_code, total_papers)``, the fraction rounded to two
        decimals.
    """
    rel_freq = defaultdict(tuple)
    # dict.fromkeys de-duplicates while keeping first-seen order:
    # code_freq keys first, then entities found only in no_code_freq.
    for name in dict.fromkeys([*code_freq, *no_code_freq]):
        # .get() avoids inserting keys into a defaultdict argument and also
        # works with plain dicts.
        with_code = code_freq.get(name, 0)
        total = with_code + no_code_freq.get(name, 0)
        # total > 0 guards the division when min_count is negative.
        if total > min_count and total > 0:
            rel_freq[name] = (round(with_code / total, 2), total)
    return rel_freq
def rank_code_release_frequency_per_entity(data, top=10, min_count=0,
                                           print_all=False, **kwargs):
    """Print entities ranked by their fraction of papers with released code.

    Args:
        data: Paper records, forwarded to ``frequency_per_code_release``.
        top: Number of entries requested from each ranking (ignored when
            ``print_all`` is true).
        min_count: Threshold forwarded to
            ``compute_relative_code_release_frequency``.
        print_all: When true, print one ranking with every entity instead of
            the separate lowest/highest rankings.
        **kwargs: Extra keyword arguments forwarded to
            ``frequency_per_code_release``.
    """
    with_code, without_code = frequency_per_code_release(data, **kwargs)
    fractions = compute_relative_code_release_frequency(
        with_code, without_code, min_count)

    if print_all:
        print_ranked_freq(fractions, top=len(fractions))
        return

    def lowest_first(entry):
        # Ascending fraction; ties are broken by the larger paper count.
        stats = entry[1]
        return (stats[0], -stats[1])

    print("\nLowest fraction of papers with code\n")
    print_ranked_freq(fractions, top=top, reverse=False, sort_key=lowest_first)
    print("\nHighest fraction of papers with code\n")
    print_ranked_freq(fractions, top=top, reverse=True)
def rank_pubs_per_code_release(data, top=10, min_count=0, **kwargs):
    """Print the code release fraction of the entities with the most papers.

    Args:
        data: Paper records, forwarded to ``frequency_per_code_release``.
        top: Number of entries requested from the ranking.
        min_count: Threshold forwarded to
            ``compute_relative_code_release_frequency``.
        **kwargs: Extra keyword arguments forwarded to
            ``frequency_per_code_release``.
    """
    with_code, without_code = frequency_per_code_release(data, **kwargs)
    fractions = compute_relative_code_release_frequency(
        with_code, without_code, min_count)

    def by_total_papers(entry):
        # entry is (name, (fraction, total)); rank on the total.
        return entry[1][1]

    print("\nCode release fraction for biggest publishers\n")
    print_ranked_freq(fractions, top=top, reverse=True, sort_key=by_total_papers)
def main(args):
    """Load the affiliation data and print the code release report.

    Args:
        args: Parsed command-line namespace providing ``input`` (path to the
            JSON file) and ``full`` (print the full list of affiliations
            instead of the summary report).
    """
    # Read as UTF-8 explicitly: the platform default encoding may differ and
    # would mis-decode or reject non-ASCII affiliation names.
    with open(args.input, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if args.full:
        # Full list
        rank_code_release_frequency_per_entity(data, print_all=True)
        return

    print('======================')
    print('Code release fraction')
    print('======================\n')
    print('Format: rank. name (fraction, #papers)')

    print('\nInstitutions')
    print('============')
    rank_code_release_frequency_per_entity(data, top=20, min_count=1)

    # min_count is an exclusive threshold (total > min_count), so the heading
    # says "> 5" to match what is actually reported.
    print('\nWords (#papers > 5)')
    print('=====')
    rank_code_release_frequency_per_entity(data, top=20, min_count=5, words=True)

    print('\nBiggest publishers')
    print('==================')
    rank_pubs_per_code_release(data, top=20)
def parse_args():
    """Parse the command-line options of the script.

    Returns:
        The ``argparse`` namespace with ``input`` (path to the affiliation
        JSON data) and ``full`` (boolean flag).
    """
    option_specs = (
        (('-i', '--input'), {
            'type': str,
            'default': './out/neurips_2019/affiliations.json',
            'help': 'Path to affiliation JSON data',
        }),
        (('-f', '--full'), {
            'action': 'store_true',
            'help': 'Report full list of affiliations',
        }),
    )
    parser = argparse.ArgumentParser()
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)
    return parser.parse_args()
if __name__ == '__main__':
    # Script entry point: parse the command line, then run the report.
    main(parse_args())