# WoSTSVparser.py

"""
Use with Python 3.4

This script has been tested with a file generated by exporting as "TSV Mac UTF-8" from
the Thomson Reuters Web of Science. Change the filename in the open() call below; each
field of a record can then be read by its column position. Some examples of printing
fields are given at the end of the script.

Note: semicolon-separated fields can be split into their items in the same way as the
'CR' example at the end of the script.
"""

# Load every record of the export into memory as a list of rows, where each row
# is the list of tab-separated string fields of one line. The file is decoded
# explicitly as UTF-8 ('utf-8-sig' also tolerates a leading byte-order mark)
# instead of relying on the platform's default encoding.
with open('wostestar.tsv', 'r', encoding='utf-8-sig') as tsv: # change file-name here
    next(tsv, None) # Skip the first line (the TSV headers); tolerate an empty file
    # Only the line terminator is removed before splitting: a plain strip() would
    # also remove leading/trailing tabs, i.e. empty first/last fields, and thereby
    # shift or truncate the columns. Blank lines are not records and are skipped.
    WoSdata = [line.rstrip('\r\n').split('\t') for line in tsv if line.strip()]

_WOS_FIELD_COUNT = 61 # Number of columns read below (W[0] .. W[60])


def _to_int(value):
    """Return *value* parsed as an int, or None when it is blank or not numeric."""
    try:
        return int(value)
    except ValueError:
        return None


# Walk over every record and bind each column to a variable named after its
# two-letter Web of Science field tag. Columns are addressed by position, so
# this assumes the export keeps the column order listed below.
# NOTE(review): meanings marked "likely" are taken from the public WoS
# field-tag list, not from this script -- verify against your export.
for W in WoSdata:
    # Pad short rows with empty strings so that every positional lookup below
    # succeeds even when trailing columns are missing from a record.
    if len(W) < _WOS_FIELD_COUNT:
        W = W + [''] * (_WOS_FIELD_COUNT - len(W))

    PT = W[0] # Publication Type
    AU = W[1] # Authors
    BA = W[2] # likely Book Authors
    BE = W[3] # Editors of Proceedings
    GP = W[4] # likely Book Group Authors
    AF = W[5] # Authors Full
    BF = W[6] # likely Book Authors Full Name
    CA = W[7] # Group Authors
    TI = W[8] # Title
    SO = W[9] # Source (Journal title, full)
    SE = W[10] # Book Series title
    BS = W[11] # likely Book Series Subtitle
    LA = W[12] # Language
    DT = W[13] # Document Type
    CT = W[14] # Conference Title
    CY = W[15] # Conference Date
    CL = W[16] # Conference Location
    SP = W[17] # Conference Sponsors
    HO = W[18] # Conference Host
    DE = W[19] # Original Keywords
    ID = W[20] # New Keywords by ISI (keywords plus)
    AB = W[21] # Abstract
    C1 = W[22] # Research Addresses: Note [] in fields.
    RP = W[23] # Reprint Address
    EM = W[24] # E-mail (Semi-colon separated)
    RI = W[25] # Researcher ID
    OI = W[26] # likely ORCID identifier
    FU = W[27] # Funding agency and grant number
    FX = W[28] # Funding text
    CR = W[29] # Cited references (Semi-colon separated)
    # Numeric columns are parsed leniently: a blank or non-numeric cell gives
    # None instead of aborting the whole run with a ValueError.
    NR = _to_int(W[30]) # Cited reference count (Numerical value)
    TC = _to_int(W[31]) # Times cited (Numerical value)
    Z9 = _to_int(W[32]) # Total times Cited
    U1 = _to_int(W[33]) # likely usage count, last 180 days
    U2 = _to_int(W[34]) # likely usage count, since 2013
    PU = W[35] # Publisher
    PI = W[36] # Publisher city
    PA = W[37] # Publisher Address
    SN = W[38] # ISSN (String value)
    EI = W[39] # likely eISSN
    BN = W[40] # ISBN
    J9 = W[41] # 29 Character Journal Abbreviation
    JI = W[42] # ISO Journal Title Abbreviation
    PD = W[43] # Publication date (mixed string and possible integer value)
    PY = _to_int(W[44]) # Publication Year (Could also be parsed with date module)
    VL = W[45] # Volume (could also be parsed as an integer, but not really useful)
    IS = W[46] # Issue (contains both numerical values and hyphenations)
    PN = W[47] # Part Number
    SU = W[48] # Supplement (number)
    SI = W[49] # Special Issue
    MA = W[50] # likely Meeting Abstract
    BP = W[51] # Beginning page
    EP = W[52] # End page
    AR = W[53] # Article number of APS journals
    DI = W[54] # DOI Number
    D2 = W[55] # likely Book DOI
    PG = _to_int(W[56]) # Number of Pages
    WC = W[57] # Research Field
    SC = W[58] # Science Categories?
    GA = W[59] # IDS number, ISI original
    UT = W[60] # WOS ISI unique article identifier

    # Example Printouts
    print("-" * 30) # Only for "prettier" printout
    print(AU)
    print(PY)
    print(WC)
    # This shows how to open up semi-colon separated fields.
    for citref in CR.split('; '):
        print(citref)