-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreateJsonFiles.py
109 lines (90 loc) · 4.24 KB
/
createJsonFiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
# createJsonFiles.py
"""
Before running, make sure you set the scripts folder and output
folder paths below.
Your Mapper.py and Reducer.py should both be within the scripts
folder of your s3 account and must be named 'Mapper.py' and
'Reducer.py' (case sensitive).
The output folder is where the map/reduce resultant files will
be placed.
"""
## SCRIPT AND OUTPUT FOLDER PATHS MUST BE
## CHANGED PRIOR TO RUNNING THIS SCRIPT
## (scriptsPathCheck() exits if either placeholder below is left as-is).
## Keep the trailing "/" on both values: file and folder names are
## appended directly to these strings when the step arguments are built.
scriptsPath = "s3://[S3 Bucket]/[Scripts Folder]/"
outputPath = "s3://[S3 Bucket]/[EMR Output Folder]/"
import sys
import re
import os
def main():
    """Interactively turn a list of WET file URLs into a JSON list of steps.

    Flow: verify the S3 path constants were edited, prompt for the input
    and output file names, confirm overwriting an existing output file,
    then write one step object per input line to "./<output>.json" and
    print a summary.

    Each input line is split on "/" and fields 4, 6 and 8 are used as the
    crawl id, segment number and file name (presumably lines look like
    "s3://commoncrawl/crawl-data/<crawl>/segments/<segment>/wet/<file>",
    which is the URL this function rebuilds for "-input" -- verify
    against your input list).

    Exits via sys.exit() on a short (< 10 characters) line, as before, and
    with status 1 on a longer line that does not have the expected fields.
    """
    # exitMessage() looks these two names up at module scope, so they are
    # published as globals; as plain locals the summary raised NameError.
    global lineCounter, outputFileName

    scriptsPathCheck()
    inputFileName = getUserInputFile()
    outputFileName = getUserOutputFile()
    lineCounter = 0

    if not detectExistingOutputFile(outputFileName):
        ccMainNumber = None
        with open("./" + inputFileName) as infile:
            with open("./" + outputFileName + '.json', 'w+') as outfile:
                # Print the JSON file header markup
                outfile.write("[\n")
                for line in infile:
                    # Check for non-URL entries in input file
                    if len(line) < 10:
                        print("Short line in input file, please check/rid your input file of extra blank lines. Exiting...")
                        sys.exit()
                    try:
                        ccMainNumber, segmentnumber, filerange, fileNumber = _parseWetPath(line)
                    except IndexError:
                        print("Malformed line %d in input file (expected a WET file URL). Exiting..." % (lineCounter + 1))
                        sys.exit(1)
                    # Entries are comma-separated; no comma before the first.
                    if lineCounter > 0:
                        outfile.write(",\n")
                    outfile.write(_renderStep(ccMainNumber, segmentnumber, filerange, fileNumber))
                    # Count every entry written so the summary is accurate.
                    lineCounter += 1
                # Print the JSON file footer markup exactly once, after the
                # last entry, so the file is a single well-formed list.
                outfile.write("]\n")
        # NOTE(review): the original indentation was lost, so the intended
        # placement of this print is unknown; it is emitted once, and only
        # if at least one line was parsed (avoids NameError on empty input).
        if ccMainNumber is not None:
            print(ccMainNumber)
        exitMessage()


def _parseWetPath(line):
    """Split one input line into (crawl id, segment, file name, file number).

    Raises IndexError when the line has fewer "/"-separated fields, or the
    file name fewer "-"-separated fields, than the indices used here.
    """
    pathParts = line.split("/")
    ccMainNumber = pathParts[4]
    segmentnumber = pathParts[6]
    filerange = pathParts[8].rstrip()
    fileNumber = filerange.split("-")[3]
    return ccMainNumber, segmentnumber, filerange, fileNumber


def _renderStep(ccMainNumber, segmentnumber, filerange, fileNumber):
    """Return the JSON text of one streaming step (no trailing separator)."""
    inputUrl = ("s3://commoncrawl/crawl-data/" + ccMainNumber +
                "/segments/" + segmentnumber + "/wet/" + filerange)
    return "".join([
        '{\n',
        '"Name": "segment_' + segmentnumber + '_file_' + fileNumber + '",\n',
        '"ActionOnFailure": "CONTINUE",\n',
        '"Jar": "/home/hadoop/contrib/streaming/hadoop-streaming.jar",\n',
        '"Args":\n',
        '[\n',
        '"-files", "' + scriptsPath + 'Mapper.py,' + scriptsPath + 'Reducer.py",\n',
        '"-mapper", "Mapper.py",\n',
        '"-reducer", "Reducer.py",\n',
        '"-input", "' + inputUrl + '",\n',
        '"-output", "' + outputPath + segmentnumber + '_' + fileNumber + '",\n',
        '"-inputformat", "SequenceFileAsTextInputFormat"\n',
        ']\n',
        '}',
    ])
def getUserInputFile():
    """Prompt on the console for the input file name and return what was typed."""
    return raw_input("Enter input file: ")
def getUserOutputFile():
    """Prompt on the console for the output file name and return what was typed."""
    return raw_input("Enter output file: ")
def detectExistingOutputFile(file):
    """Confirm with the user before an existing "<file>.json" is overwritten.

    file -- output file name without the ".json" extension.

    Returns False whenever it is fine to proceed: either no such file
    exists, or the user answered "y". Answering "n" terminates the script
    through sys.exit(). Any other answer is rejected and the question is
    asked again, so a typo can no longer silently overwrite the file.
    """
    if not os.path.isfile(file + '.json'):
        return False

    while True:
        decision = raw_input("\nWARNING! If you proceed, you will overwrite your output file. Are you sure?\n[y/n]\n")
        # Tolerate surrounding whitespace and upper-case answers.
        decision = decision.strip().lower()
        if decision == "n":
            print("Exiting...\n")
            sys.exit()
        if decision == "y":
            return False
def exitMessage(lineCounter=None, outputFileName=None):
    """Print the closing summary for a finished run.

    lineCounter    -- number of entries written (optional).
    outputFileName -- name of the output file (optional).

    Both arguments may be omitted, in which case module-level names of the
    same spelling are used when they exist. If neither source provides the
    values, a generic message is printed instead of raising NameError.
    """
    moduleNames = globals()
    if lineCounter is None:
        lineCounter = moduleNames.get("lineCounter")
    if outputFileName is None:
        outputFileName = moduleNames.get("outputFileName")

    if lineCounter is None or outputFileName is None:
        print("\nFinished writing output file.")
        return

    # Formatted into a single string so the output is the same on
    # Python 2 and Python 3 (space-separated, one trailing newline).
    print("\nSuccessfully wrote %s lines to output file: %s" % (lineCounter, outputFileName))
def scriptsPathCheck():
    """Exit if scriptsPath or outputPath still holds its template placeholder.

    The scripts path is checked first; the first placeholder found prints
    its reminder and ends the program through sys.exit().
    """
    placeholderChecks = (
        (scriptsPath,
         "s3://[S3 Bucket]/[Scripts Folder]/",
         "You need to open and edit the script path of createJsonFiles.py file prior to running. Exiting..."),
        (outputPath,
         "s3://[S3 Bucket]/[EMR Output Folder]/",
         "You need to open and edit the output path of createJsonFiles.py file prior to running. Exiting..."),
    )
    for configuredValue, placeholder, reminder in placeholderChecks:
        if configuredValue == placeholder:
            print(reminder)
            sys.exit()
if __name__ == "__main__":
    # Run the interactive generator only when executed as a script,
    # not when this file is imported as a module.
    main()