# flickrGroup.py
# Forked from MiguelVieira/WWF-Ecoregions.
import flickrapi
import xml.etree.ElementTree as ET
import pickle
import ecoInfo
import string
# Read the Flickr API key from the first line of flickrKey.txt.
with open('flickrKey.txt', 'r') as f:
    # readline() keeps the trailing newline; strip it so the key is passed
    # to the API client without stray whitespace.
    key = f.readline().strip()

# Group id sent as group_id when the photo pool is queried.
groupId = '1334707@N21'

# Shared API client used by getPhoto and getGroupPhotos.
flickr = flickrapi.FlickrAPI(key)
def getTags(photoXml):
    """Return the text of every tag element under the photo's 'tags' child.

    photoXml is an XML element for one photo.  Each child of its 'tags'
    element contributes its text to the result, in document order.

    Returns an empty list when the photo has no 'tags' child at all, rather
    than raising TypeError from iterating over None.
    """
    tagsNode = photoXml.find('tags')
    # Compare against None explicitly: an element without children is falsy.
    if tagsNode is None:
        return []
    return [tag.text for tag in tagsNode]
def getUrl(photoXml):
    """Return the text of the first 'url' element inside the photo's 'urls' child.

    Raises AttributeError if either element is missing.
    """
    urlsNode = photoXml.find('urls')
    firstUrl = urlsNode.find('url')
    return firstUrl.text
def getPhoto(photoId):
    """Fetch the info record for photoId and return the first element of the response.

    Uses the module-level flickr client and API key.
    """
    response = flickr.photos_getInfo(api_key=key, photo_id=photoId)
    # Presumably the first child is the <photo> element; confirm against the API response.
    return response[0]
def getGroupPhotos(groupId, pageIndex, photoCount):
    """Request one page of a group's photo pool.

    groupId    -- sent as group_id
    pageIndex  -- sent as page
    photoCount -- sent as per_page

    Prints a progress line, then returns the first element of the response.
    """
    # A parenthesised single-argument print prints the same text as the
    # print statement.
    print("getGroupPhotos page " + str(pageIndex))
    response = flickr.groups_pools_getPhotos(
        group_id=groupId, page=pageIndex, per_page=photoCount)
    return response[0]
def getPhotoIds(groupXml):
    """Collect the 'id' attribute of every child element of groupXml.

    Raises KeyError if a child has no 'id' attribute.
    """
    ids = []
    for photoNode in groupXml:
        ids.append(photoNode.attrib['id'])
    return ids
def getPageCount(groupXml):
    """Return the element's 'pages' attribute converted to an int."""
    pagesAttr = groupXml.attrib['pages']
    return int(pagesAttr)
def getPhotosIdsFromApi():
    """Download the id of every photo in the group pool.

    Fetches page 1 to learn the total page count, then walks the remaining
    pages in order.  The complete id list is saved with picklePhotoIds
    before being returned.
    """
    perPage = 500
    firstPage = getGroupPhotos(groupId, 1, perPage)
    totalPages = getPageCount(firstPage)
    collected = getPhotoIds(firstPage)

    # Page 1 is already in hand; continue from page 2 through the last page.
    nextPage = 2
    while nextPage <= totalPages:
        pageXml = getGroupPhotos(groupId, nextPage, perPage)
        collected.extend(getPhotoIds(pageXml))
        nextPage += 1

    picklePhotoIds(collected)
    return collected
def normalizeEcoregion(e):
    """Reduce an ecoregion name or tag to a canonical comparison key.

    Lower-cases the text, removes spaces, hyphens, commas and periods, and
    drops every character that is not in string.printable, so differently
    punctuated spellings of the same name produce the same key.

    Always returns a string.  The result is assembled with str.join rather
    than filter() because filter() returns a lazy iterator on Python 3,
    which could never equal (or be found as) a dictionary key.
    """
    e = e.lower()
    for separator in (' ', '-', ',', '.'):
        e = e.replace(separator, '')
    return ''.join(ch for ch in e if ch in string.printable)
# Path of the on-disk cache holding the pickled list of photo ids.
pickleFile = "pickle.bin"


def unpicklePhotoIds():
    """Load and return the photo-id list previously saved by picklePhotoIds.

    The file is opened in binary mode to match how it is written, and is
    closed as soon as loading finishes.
    """
    # NOTE: pickle.load can execute arbitrary code; only read a cache file
    # that this script wrote itself.
    with open(pickleFile, "rb") as cacheFile:
        return pickle.load(cacheFile)


def picklePhotoIds(photoIds):
    """Save photoIds to pickleFile, overwriting any previous cache."""
    with open(pickleFile, "wb") as cacheFile:
        pickle.dump(photoIds, cacheFile)
def isEcoregion(tag):
    """Return True when the normalised tag is a key of normalizedEcoregions."""
    normalizedTag = normalizeEcoregion(tag)
    return normalizedTag in normalizedEcoregions
def ecoregionsInTags(tags):
    """Return the tags that name a known ecoregion, as a list in input order.

    A list is built explicitly (rather than calling filter()) so the result
    supports len() and truth tests on Python 3 as well, where filter() is
    lazy.
    """
    return [tag for tag in tags if isEcoregion(tag)]
def getECodes(ecoregions):
    """Look up the code for each ecoregion name via normalizedEcoregions.

    Returns the codes in the same order as the input.  Raises KeyError for
    a name whose normalised form is not a key of the table.
    """
    codes = []
    for name in ecoregions:
        codes.append(normalizedEcoregions[normalizeEcoregion(name)])
    return codes
# Lookup table: normalised ecoregion name -> ecoregion code, built from
# every code that ecoInfo reports.
normalizedEcoregions = {}
for eCode in ecoInfo.getAllECodes():
    normalizedName = normalizeEcoregion(ecoInfo.getEcoregion(eCode))
    normalizedEcoregions[normalizedName] = eCode
# Script body: scan the group pool, record which ecoregion codes are covered
# by at least one tagged photo, and list the photos carrying no ecoregion tag.

photoIds = getPhotosIdsFromApi()
badPhotos = list()    # ids of photos with no recognised ecoregion tag
badPhotoUrls = []     # URL of each bad photo, in the same order as badPhotos
eCodes = set()        # ecoregion codes seen on at least one photo

# Upper bound on how many photos are inspected in one run.
maxPhotos = 10000

totalPhotos = len(photoIds)
# Slice up front so every photo that is announced is also processed; checking
# the limit after incrementing the counter would skip the photo just printed.
for count, photoId in enumerate(photoIds[:maxPhotos], 1):
    print("processing photo " + str(count) + " of " + str(totalPhotos))
    photoXml = getPhoto(photoId)
    # list() keeps the emptiness test valid even if the helper returns a
    # lazy iterable.
    ecoregions = list(ecoregionsInTags(getTags(photoXml)))
    if ecoregions:
        eCodes.update(getECodes(ecoregions))
    else:
        badPhotos.append(photoId)
        # Take the URL from the response already in hand instead of asking
        # the API for the same photo again when the report is written.
        badPhotoUrls.append(getUrl(photoXml))

# One ecoregion code per line.
with open('eotw.txt', 'w') as f:
    for e in eCodes:
        f.write(e + '\n')

# One link per photo that had no ecoregion tag.
with open('badPhotos.html', 'w') as f:
    for url in badPhotoUrls:
        f.write("<a href =\"" + url + "\">" + url + "</a><br/>" + '\n')