-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCSV Parsing.py
127 lines (69 loc) · 2.33 KB
/
CSV Parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python
# coding: utf-8
# pandas 0.23.4
# - important module in Data Analysis in Python
#
# pycountry 18.12.8
# - provides the ISO databases.
#
# In[32]:
import pandas as pd
import pycountry as pc
# Load `plik.csv` into the dataframe `plik`; the file is comma-separated and has no header row.
# In[22]:
plik = pd.read_csv("plik.csv", sep=",", header=None)
# Show `plik`; for a large file use `plik.head()` to display only the first 5 rows.
# In[23]:
plik
# Name the columns so they are easier to work with:
# - Data
# - Subdivisions
# - Number
# - Percent
# In[24]:
plik.columns = ['Data', 'Subdivisions', 'Number', 'Percent']
# Parse the `Data` column into datetime values (the text is written as month/day/year).
# In[25]:
plik['Data'] = pd.to_datetime(plik['Data'], format='%m/%d/%Y')
# Strip the trailing "%" from the `Percent` column and convert the rest to float.
# In[26]:
plik['Percent'] = plik['Percent'].str.rstrip('%').astype('float')
# Create a new column `CountryCode` holding the three-letter country code of each subdivision (`XXX` when no country is found)
# In[27]:
def get_country_code(name):
    """Return the country code of the first subdivision whose name contains `name`.

    Subdivisions are scanned in the order `pc.subdivisions` yields them and
    the match is a containment test (`name in subdivision.name`). When no
    subdivision matches, the placeholder 'XXX' is returned.
    """
    # Lazy generator: the scan stops at the first matching subdivision.
    matching_codes = (
        subdivision.country_code
        for subdivision in pc.subdivisions
        if name in subdivision.name
    )
    return next(matching_codes, 'XXX')
def get_country_code_alpha_3(name):
    """Convert a two-letter country code into its three-letter form.

    Parameters
    ----------
    name : str
        Country code to translate, or the placeholder 'XXX'.

    Returns
    -------
    str
        The `alpha_3` attribute of the entry in `pc.countries` whose
        `alpha_2` equals `name`; 'XXX' when `name` is the placeholder or
        when no entry matches.
    """
    # The placeholder maps to itself, so answer before scanning the countries.
    if name == 'XXX':
        return 'XXX'
    for country in pc.countries:
        # Compare for equality: a containment test (`name in alpha_2`) would
        # let '' or a single letter match an unrelated country.
        if country.alpha_2 == name:
            return country.alpha_3
    return 'XXX'
# Look up the country code of every row's subdivision ('XXX' when none is found).
names = plik['Subdivisions']
kod = [get_country_code(subdivision) for subdivision in names]
# Translate each of those codes into its three-letter form and store the
# result, row for row, in the new `CountryCode` column.
names2 = kod
kod_alpha_3 = [get_country_code_alpha_3(country_code) for country_code in names2]
plik['CountryCode'] = kod_alpha_3
# Add the column `Clicks`, computed as `(Number * Percent) / 100` and rounded to a whole number.
# In[28]:
plik.insert(
    loc=5,
    column='Clicks',
    value=(plik['Number'] * plik['Percent'] / 100).round(0).astype('int'),
)
# Group the rows by the columns `Data` and `CountryCode`.
# In[29]:
group = plik.groupby(by=['Data', 'CountryCode'])
# Show the first per-group sums of the columns `Number` and `Clicks`.
# The columns are selected with a list (double brackets): selecting several
# columns with a bare tuple of keys is deprecated in later pandas releases.
# In[30]:
group[['Number', 'Clicks']].sum().head()
# Write the summed groups to `new.csv` (comma-separated, with a header row).
# `reset_index()` turns the group keys back into ordinary columns, and
# `index=False` keeps the row index out of the file. The path is relative,
# so the file is created in the current working directory.
# In[31]:
group[['Number', 'Clicks']].sum().reset_index().to_csv('new.csv', index=False)