woscop.py
"""
This script takes a Scopus and Web of Science file and merges them to a single dataset,
removing duplicates. For further information, see README.md
Share without any restrictions.
"""
import csv
import re
from sys import argv

print("Usage: python3 woscop.py [scopus filename.csv] [wos filename.tsv]\n" + "-" * 40)

script, scopusfile, wosfile = argv
scopuscsv = open(scopusfile, 'r')  # Scopus export, comma-separated
scopusdata = csv.reader(scopuscsv, delimiter=',', quotechar='"')
next(scopusdata)  # skip the header row
wostsv = open(wosfile, 'r')  # Web of Science export, tab-separated
wosdata = csv.reader(wostsv, delimiter='\t')
next(wosdata)  # skip the header row
# Record counters, for checking against your original downloads.
scopuscount = 0
woscount = 0
recordlist = []  # Holds the extracted data from the loops
therecords = {}  # Takes the duplicate-check string as key and the rest of the data as value

# Loops to extract the duplicate-check string and the desired fields from the data.
for s in scopusdata:
    scopuscount += 1  # just counting
    #print(s[0])  # print whatever you want to inspect; see the headers in the csv file
    stitlelowered = s[1].lower()  # lower-case the title
    ssplitted = stitlelowered.split()  # split the title into words
    sfirstsevenwords = ssplitted[0:7]  # keep only the first seven words to avoid dual-language titles
    sjoined = ''.join(sfirstsevenwords)  # join them back together
    stitlenonspecialchar = re.sub(r'[^A-Za-z0-9]+', r'', sjoined)  # remove everything except letters and digits
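    # Illustration of the duplicate-check key (hypothetical title):
    #   "A Study of Bibliometrics: An Overview" -> "astudyofbibliometricsanoverview"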
    # Assumed Scopus column order: Authors, Title, Year, Source title, Volume, Issue.
    recordlist.append([stitlenonspecialchar, s[0], s[2], s[1], s[3], s[4], s[5]])  # duplicate-check string, then Author, Year, Title, Journal, Volume, Issue
for w in wosdata:
    woscount += 1
    #print(w)
    #print(w[1])
    titlelowered = w[8].lower()
    splitted = titlelowered.split()
    firstsevenwords = splitted[0:7]  # same first-seven-words rule as above
    joined = ''.join(firstsevenwords)
    titlenonspecialchar = re.sub(r'[^A-Za-z0-9]+', r'', joined)
    recordlist.append([titlenonspecialchar, w[1], w[44], w[8], w[9], w[45], w[46]])  # duplicate-check string, then Author, Year, Title, Journal, Volume, Issue
# Take the duplicate-check string as the key in the dictionary and the rest as the value;
# a later record with the same key simply overwrites the earlier one.
for r in recordlist:
    therecords.update({r[0]: [r[1], r[2], r[3], r[4], r[5], r[6]]})

# Keep a record only if its duplicate-check string does NOT already exist in the result dict.
result = {}
for key, value in therecords.items():
    if key not in result:
        result[key] = value
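# Note: the keys of therecords are already unique, so result ends up the same size;
# the explicit check above is kept as a readable de-duplication step.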
# Open and write to a new csv.
with open('output.csv', 'w', newline='') as csvfile:
    fieldnames = ['Author', 'Year', 'Title', 'Journal', 'Volume', 'Issue']  # add here whatever you need
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quotechar='"')
    writer.writeheader()
    for key, value in sorted(result.items()):
        #print(value[0])
        writer.writerow({'Author': value[0], 'Year': value[1], 'Title': value[2],
                         'Journal': value[3], 'Volume': value[4], 'Issue': value[5]})  # if you added fields above, add them here as well
# Print some control information.
print("There were originally " + str(scopuscount) + " Scopus records and " + str(woscount) + " WoS records.")
print("Duplicates excluded, there are now " + str(len(therecords)) + " records.")
print("Writing " + str(len(result)) + " records to file.\nFile written: output.csv")