-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathWoSTSVparser.py
86 lines (81 loc) · 3.05 KB
/
WoSTSVparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Use with Python 3.4
This script has been tested with a file generated by exporting as "TSV Mac UTF-8" from
the Thomson Reuters Web of Science. Change filename in the line below, then you should
be able to parse each field accordingly. Some example printouts are given at the end of
the script.
Note: the semicolon-separated fields can easily be split, as shown in the example with 'CR'.
"""
WOS_FILENAME = 'wostestar.tsv'  # change file-name here

# Field tags of the Web of Science TSV export, in column order.
# A '?' marks a field whose meaning is not documented in this script.
WOS_FIELDS = (
    'PT',  # Publication Type
    'AU',  # Authors
    'BA',  # ?
    'BE',  # Editors of Proceedings
    'GP',  # ?
    'AF',  # Authors Full
    'BF',  # ?
    'CA',  # Group Authors
    'TI',  # Title
    'SO',  # Source (Journal title, full)
    'SE',  # Book Series title
    'BS',  # ?
    'LA',  # Language
    'DT',  # Document Type
    'CT',  # Conference Title
    'CY',  # Conference Date
    'CL',  # Conference Location
    'SP',  # Conference Sponsors
    'HO',  # Conference Host
    'DE',  # Original Keywords
    'ID',  # New Keywords by ISI (keywords plus)
    'AB',  # Abstract
    'C1',  # Research Addresses: Note [] in fields.
    'RP',  # Reprint Address
    'EM',  # E-mail (Semi-colon separated)
    'RI',  # Researcher ID
    'OI',  # ?
    'FU',  # Funding agency and grant number
    'FX',  # Funding text
    'CR',  # Cited references (Semi-colon separated)
    'NR',  # Cited reference count (Numerical value)
    'TC',  # Times cited (Numerical value)
    'Z9',  # Total times Cited
    'U1',  # ?
    'U2',  # ?
    'PU',  # Publisher
    'PI',  # Publisher city
    'PA',  # Publisher Address
    'SN',  # ISSN (String value)
    'EI',  # ?
    'BN',  # ISBN
    'J9',  # 29 Character Journal Abbreviation
    'JI',  # ISO Journal Title Abbreviation
    'PD',  # Publication date (mixed string and possible integer value)
    'PY',  # Publication Year (Could also be parsed with date module)
    'VL',  # Volume (could also be parsed as an integer, but not really useful)
    'IS',  # Issue (contains both numerical values and hyphenations)
    'PN',  # Part Number
    'SU',  # Supplement (number)
    'SI',  # Special Issue
    'MA',  # ?
    'BP',  # Beginning page
    'EP',  # End page
    'AR',  # Article number of APS journals
    'DI',  # DOI Number
    'D2',  # ?
    'PG',  # Number of Pages
    'WC',  # Research Field
    'SC',  # Science Categories?
    'GA',  # IDS number, ISI original
    'UT',  # WOS ISI unique article identifier
)

# Fields that are converted to int (None when the field is blank).
WOS_INT_FIELDS = frozenset(('NR', 'TC', 'Z9', 'U1', 'U2', 'PY', 'PG'))


def _to_int(text):
    """Return text converted to int, or None when the field is blank.

    Non-blank text that is not a valid integer still raises ValueError, so
    malformed data is not silently hidden.
    """
    text = text.strip()
    return int(text) if text else None


def parse_wos_line(line):
    """Split one TSV data line into a dict keyed by the tags in WOS_FIELDS.

    Rows with fewer columns than WOS_FIELDS are padded with empty strings and
    extra columns are ignored. Fields listed in WOS_INT_FIELDS are returned as
    int (or None when blank); every other field is returned as a string.
    """
    # Remove only the line terminator: a plain strip() would also eat leading
    # and trailing tabs, i.e. empty first/last fields, and shift the columns.
    values = line.rstrip('\r\n').split('\t')
    values += [''] * (len(WOS_FIELDS) - len(values))
    record = {}
    for tag, value in zip(WOS_FIELDS, values):
        record[tag] = _to_int(value) if tag in WOS_INT_FIELDS else value
    return record


def read_wos_records(lines):
    """Parse an iterable of TSV lines into a list of record dicts.

    The first line is skipped because it contains the TSV headers. Lines that
    contain only whitespace are ignored. An empty input yields an empty list.
    """
    lines = iter(lines)
    next(lines, None)  # skip the header line; the default tolerates empty input
    return [parse_wos_line(line) for line in lines if line.strip()]


def main(filename=WOS_FILENAME):
    """Read the TSV file and print a few example fields for every record."""
    # The export is UTF-8; 'utf-8-sig' also accepts a leading byte-order mark.
    with open(filename, 'r', encoding='utf-8-sig') as tsv:
        records = read_wos_records(tsv)  # reads everything as a list
    for record in records:
        # Example Printouts
        print("-" * 30)  # Only for "prettier" printout
        print(record['AU'])
        print(record['PY'])
        print(record['WC'])
        # This shows how to open up semi-colon separated fields.
        for citref in record['CR'].split('; '):
            print(citref)


if __name__ == '__main__':
    main()