-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathTermsOfService.py
144 lines (112 loc) · 6.04 KB
/
TermsOfService.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 10 10:33:02 2015
@author: k_schinkel
"""
# Standard library
import re  # only for use of regex flags in pandas str methods
import urllib
import urllib.request  # "import urllib" alone does not make urllib.request available

# Third-party
from bs4 import BeautifulSoup
import pandas as pd
# Terms-of-service documents to scrape, grouped by the HTML element that
# carries the paragraph text on each site: <p> tags, <h3>-heading siblings
# (Instagram), and <li> items respectively.
urls = [
    "http://policies.yahoo.com/us/en/yahoo/terms/utos/index.htm",
    "https://www.google.com/intl/en/policies/terms/",
    "https://help.github.com/articles/github-terms-of-service/",
    "https://wikimediafoundation.org/wiki/Terms_of_Use",
    "http://www.amazon.com/gp/help/customer/display.html/?ie=UTF8&nodeId=508088",
    "https://pages.soundcloud.com/en/legal/terms-of-use.html",
    "https://twitter.com/tos?lang=en",
    "https://cloudant.com/terms/",
    "http://www.apple.com/legal/internet-services/icloud/en/terms.html",
]
urls2 = ["https://help.instagram.com/478745558852511"]
urls3 = [
    "https://www.netflix.com/TermsOfUse",
    "https://www.facebook.com/legal/terms",
    "https://www.youtube.com/t/terms",
]
# Company names, positionally aligned with the URL lists above.
companies = [
    "Yahoo", "Google", "GitHub", "Wikipedia", "Amazon",
    "SoundCloud", "Twitter", "Cloudant", "iCloud",
]
companies2 = ["Instagram"]
companies3 = ["Netflix", "Facebook", "Youtube"]
# Accumulator for one row per non-empty paragraph.
df = pd.DataFrame()
# For each URL, fetch the page, parse it, and collect every non-empty <p>
# paragraph as a row: [company, text, length, space count, capital count].
# enumerate() replaces urls.index(url), which was O(n) per iteration and
# would return the wrong index if a URL ever appeared twice.
for index, url in enumerate(urls):
    page = urllib.request.urlopen(url)  # network fetch; raises URLError if unreachable
    # Explicit parser: bs4 warns (and may vary by install) when none is given.
    soup = BeautifulSoup(page, "html.parser")
    for paragraph in soup.find_all('p'):
        paraText = paragraph.get_text().strip()
        if paraText != "":
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies[index], paraText, len(paraText),
                                  paraText.count(' '), capsCount])
            # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
            df = pd.concat([df, rowValue.to_frame().T], ignore_index=True)
# Instagram structures its terms as <h3> section headings with the body text
# two siblings after each heading, so walk headings instead of <p> tags.
for index, url in enumerate(urls2):
    page = urllib.request.urlopen(url)  # network fetch; raises URLError if unreachable
    soup = BeautifulSoup(page, "html.parser")  # explicit parser avoids bs4 warning
    for heading in soup.find_all('h3'):
        # nextSibling.nextSibling skips the whitespace text node after the heading
        para = str(heading.nextSibling.nextSibling)
        paraText = BeautifulSoup(para, "html.parser").get_text().strip()
        if paraText != "":
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies2[index], paraText, len(paraText),
                                  paraText.count(' '), capsCount])
            # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
            df = pd.concat([df, rowValue.to_frame().T], ignore_index=True)
# Netflix, Facebook, and YouTube keep their terms text in <li> items,
# so collect those instead of <p> paragraphs.
for index, url in enumerate(urls3):
    page = urllib.request.urlopen(url)  # network fetch; raises URLError if unreachable
    soup = BeautifulSoup(page, "html.parser")  # explicit parser avoids bs4 warning
    for item in soup.find_all('li'):
        paraText = item.get_text().strip()
        if paraText != "":
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies3[index], paraText, len(paraText),
                                  paraText.count(' '), capsCount])
            # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
            df = pd.concat([df, rowValue.to_frame().T], ignore_index=True)
#rename columns
df.columns = ["Company", "ParagraphText", "ParagraphLength", "SpacesCount", "CapsCount"]
# Ratio of capital letters to paragraph length.
# NOTE(review): column name keeps the original "Legth" typo so the CSV schema
# stays stable for any downstream consumers.
df["CapToLegthRatio"] = df["CapsCount"] / df["ParagraphLength"]
# Relative location of each paragraph within its company's terms of service:
# 0.0 for the first paragraph, approaching 1.0 for the last.
# Rows for a company are contiguous (appended site by site above), so a
# per-company running counter divided by that company's paragraph count is
# equivalent to the original row-by-row reset-on-company-change loop, without
# the O(n) Python iteration and chained-indexing assignments.
# (The unused firstIndex list and the companies-list concatenation that fed it
# were dead code and have been removed.)
totalParaByCompany = df.groupby("Company").size()
df["ParaLocation"] = df.groupby("Company").cumcount() / df["Company"].map(totalParaByCompany)
# 0/1 flags for presence of legally significant keywords (case-insensitive)
df['Arbitration'] = (df.ParagraphText.str.contains('arbitration', flags=re.IGNORECASE)).astype('int8')
df['ThirdParty'] = (df.ParagraphText.str.contains('third[- ]party', flags=re.IGNORECASE)).astype('int8')
df['Waiver'] = (df.ParagraphText.str.contains('waiver', flags=re.IGNORECASE)).astype('int8')
# Some more paragraph stats
# Spaces + 1 approximates word count without over-complicating tokenization.
df['ParagraphWords'] = (df.ParagraphText.str.count(' ')) + 1
# Sentences: a period followed by a space or end-of-string.
# BUG FIX: the original pattern r'\.[ $]' placed $ inside a character class,
# where it matches a literal dollar sign rather than end-of-string, so
# sentence-final periods at the end of a paragraph were never counted.
df['ParagraphSentences'] = df.ParagraphText.str.count(r'\.(?: |$)')
df['Quotes'] = df.ParagraphText.str.count(r'["]')
# Parentheses used as brackets; subtract enumerator markers like "(a)" / "(1)",
# whose two parentheses both match the first pattern.
df['Parentheses'] = df.ParagraphText.str.count(r'[)(]') - 2*df.ParagraphText.str.count(r'\([a-zA-Z0-9]\)')
# (total chars - spaces - sentence periods - quote chars - parentheses) / words
# -- not exact, but a good approximation of average word length.
df['AvgWordLength'] = (df.ParagraphLength - (df.ParagraphWords - 1) - df.ParagraphSentences - df.Quotes -
                       df.Parentheses)/df.ParagraphWords
#print to csv
df.to_csv("TermsOfService.csv", index=False)