#!/usr/bin/env python3
"""
Usage:
    python export-gh-issues.py -h

Pulls all issues from the given GitHub repo(s).

TODO:
    I bet this doesn't work when a repo has >30 issues, because pagination is
    not handled.
"""
import argparse
import json
import logging
import sys
from datetime import datetime

import requests
from pandas import json_normalize

from github_helpers import get_github_headers

logging.basicConfig(stream=sys.stderr, level=logging.INFO)
LOG = logging.getLogger(__name__)
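
# The TODO above notes that pagination is not handled, so only the first page
# (30 issues by default) comes back per repo. Below is a minimal, untested
# sketch of one way to handle it, using the `Link` response header that the
# GitHub REST API returns (exposed by requests as `response.links`). The
# function name and the per_page value are illustrative, not part of the
# original script.
def get_paginated_issues(url, gh_headers):
    """Follow GitHub's Link-header pagination and return all results."""
    results = []
    next_url = url + "?per_page=100"
    while next_url:
        response = requests.get(next_url, headers=gh_headers)
        response.raise_for_status()
        results.extend(response.json())
        # requests parses the Link header into response.links; the "next"
        # entry is absent on the last page, which ends the loop.
        next_url = response.links.get("next", {}).get("url")
    return results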

def main(filetype, raw, repos, label):
    """
    Script entrypoint.
    """
    gh_headers = get_github_headers()
    stamp = datetime.utcnow().isoformat()[0:19]
    repo_names = ','.join(repos)
    export_filename = f"output-export/{stamp}-{repo_names}.{filetype}"
    is_csv = filetype == 'csv'
    get_and_filter_issues(repos, gh_headers, export_filename, raw, label=label, csv=is_csv)

def get_and_filter_issues(all_repos, gh_headers, export_filename, raw, label=None, csv=False):
    """
    Get all issues that are not PRs from the specified repos.
    * export_filename: the name of the file you'd like your data exported to
    * raw: if True, keeps all json fields. Otherwise keeps a pre-determined filtered subset.
    * label: only return issues with the given label.
    * csv: if True, writes a flattened CSV instead of JSON; without filtering, this may not work.
    """
    all_issues = []
    for repo in all_repos:
        LOG.info("grabbing repo {0}".format(repo))
        url = "https://api.github.com/repos/openedx/{repo}/issues".format(repo=repo)
        # Only the first page of results is fetched here; see the
        # get_paginated_issues sketch above for one way to get them all.
        all_issues.extend(requests.get(url, headers=gh_headers).json())

    saved_issues = []
    # Field definitions for the hard-coded filtering of each issue down to the specified fields.
    # Keys with single values: "key": "value"
    keys_to_save = ["url", "number", "title", "body", "created_at", "updated_at"]
    # Keys whose value is a nested dict: "key": {"nkey": value, "nkey2": value}
    nested_keyvalues = [{"user": "login"}]
    # Keys whose value is a list of dicts: "key": [{"nkey": value}, {"nkey2": value}]
    listed_keyvalues = [{"labels": "name"}, {"assignees": "login"}]
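
    # Worked example (illustrative, not from the original script): for an issue
    # with "labels": [{"name": "bug"}, {"name": "epic"}], the filtered output
    # keeps "labels": [{"name": "bug"}, {"name": "epic"}] in JSON mode, while
    # CSV mode flattens it to a single "labels-name" column holding ["bug", "epic"].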
    for issue in all_issues:
        # All PRs are issues, but not all issues are PRs; keep just the issues.
        if 'pull_request' in issue:
            continue

        # Replace the API url with the human-readable GitHub url.
        issue['url'] = issue['url'].replace('api.', '')
        issue['url'] = issue['url'].replace('repos/', '')

        # Filter out issues that don't have the given label, but keep all issues
        # in the Decoupling project regardless of label.
        if label and "decoupling" not in issue["url"]:
            # Issues have a key `labels` holding a list of the form:
            #   "labels": [{"name": "label1name"}, {"name": "label2name"}]
            # (in addition to name, the inner dicts have keys: id, node_id,
            # url, color, default, description).
            match = any(item['name'] == label for item in issue["labels"])
            if not match:
                continue
        if not raw:
            # Extract just the fields we need.
            new_issue = {}
            for k in keys_to_save:
                new_issue[k] = issue[k]
            for keypair in nested_keyvalues:
                k1, k2 = next(iter(keypair.items()))
                v = issue[k1][k2]
                if csv:
                    # Flatten the tree for CSV conversion.
                    new_issue["{k1}-{k2}".format(k1=k1, k2=k2)] = v
                else:
                    new_issue[k1] = {k2: v}
            for keypair in listed_keyvalues:
                k1, k2 = next(iter(keypair.items()))
                for k1dict in issue[k1]:
                    v = k1dict[k2]
                    if csv:
                        # Flatten for CSV conversion.
                        dkey = "{k1}-{k2}".format(k1=k1, k2=k2)
                        if dkey not in new_issue:
                            new_issue[dkey] = []
                        new_issue[dkey].append(v)
                    else:
                        if k1 not in new_issue:
                            new_issue[k1] = []
                        new_issue[k1].append({k2: v})
            # new_issue is now fully built; rename it so the append below works
            # for both the raw and filtered cases.
            issue = new_issue

        # Append the (possibly filtered) issue to the list of issues we're saving.
        saved_issues.append(issue)

    if csv:
        issue_dataframe = json_normalize(saved_issues)
        issue_dataframe.to_csv(export_filename, index=False)
    else:
        with open(export_filename, "a") as export_file:
            print(json.dumps(saved_issues, indent=4), file=export_file)
    LOG.info("Successfully wrote issues to: {0}".format(export_filename))

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Use this script to collect issues (that aren't PRs) from one or more openedx "
                    "GitHub repos. GitHub provides a large number of fields on an issue; by default, "
                    "this script filters those down to a small number of useful ones. Use the --raw "
                    "flag to keep all available fields. Optionally, provide a label to get only "
                    "issues with that label.")
    parser.add_argument('filetype',
                        help="can be one of `json` or `csv`")
    parser.add_argument('-r', '--raw',
                        help="If flagged, issues will be exported with all fields present.",
                        action='store_true')
    parser.add_argument('repos',
                        help="One or more openedx repos to grab issues from",
                        nargs='+')
    parser.add_argument('-l', '--label',
                        help='Only return GitHub issues with this label.')
    args = parser.parse_args()

    # validation
    if args.filetype not in ['csv', 'json']:
        sys.exit("Filetype must be one of: `csv`, `json`")
    # csv output is untested with the --raw option... but allow people to try anyway
    if args.raw and args.filetype == 'csv':
        LOG.warning('filetype=csv and -r option unsupported')
        proceed = input("Raw output is untested with the CSV option (json nesting may be too deep). Proceed anyway? [y/n]: ")
        if proceed != 'y':
            LOG.info('Exiting program, not proceeding with raw output/csv')
            sys.exit()

    main(args.filetype, args.raw, args.repos, args.label)
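
# Example invocations (repo names here are illustrative, not endorsements of
# any particular repo):
#   python export-gh-issues.py json edx-platform
#   python export-gh-issues.py csv edx-platform course-discovery -l decoupling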