diff --git a/jedeschule/spiders/berlin.py b/jedeschule/spiders/berlin.py
index 66c337f..a28974a 100644
--- a/jedeschule/spiders/berlin.py
+++ b/jedeschule/spiders/berlin.py
@@ -15,10 +15,11 @@ class BerlinSpider(scrapy.Spider):
     # 502 with user agent = default (scrapy) -> use a real user agent like "jedeschule"
     # 429 with download delay = default -> set download delay to slow down scrapy
     # custom settings avoid other spiders from being affected of solving a spider individual problem
-    custom_settings = {'USER_AGENT': 'jedeschule (open data project)', 'DOWNLOAD_DELAY': 1,}
-    base_url = 'https://www.berlin.de/sen/bildung/schule/berliner-schulen/schulverzeichnis/'
+    custom_settings = {'USER_AGENT': 'jedeschule (open data project)', 'DOWNLOAD_DELAY': 1 }
+    base_url = 'https://www.bildung.berlin.de/Schulverzeichnis/'
     start_url = base_url + 'SchulListe.aspx'
     start_urls = [start_url]
+    url_parse_staff = base_url + 'schulpersonal.aspx?view=pers'
 
     def parse(self, response):
         schools = response.css('td a::attr(href)').extract()
@@ -28,11 +29,13 @@ def parse(self, response):
 
     def parse_detail(self, response):
         meta = {}
-        name = response.css('#ContentPlaceHolderMenuListe_lblSchulname::text').extract_first().strip()#.rsplit('-', 1)
+        name = response.css(
+            '#ContentPlaceHolderMenuListe_lblSchulname::text').extract_first().strip()  # .rsplit('-', 1)
         meta['name'] = self.fix_data(name)
         meta['id'] = self._parse_school_no(response.url)
         meta['address'] = self.fix_data(response.css('#ContentPlaceHolderMenuListe_lblStrasse::text').extract_first())
-        meta['zip'], meta['city'] = self.fix_data(response.css('#ContentPlaceHolderMenuListe_lblOrt::text').extract_first()).split(" ", 1)
+        meta['zip'], meta['city'] = self.fix_data(
+            response.css('#ContentPlaceHolderMenuListe_lblOrt::text').extract_first()).split(" ", 1)
         schooltype = re.split('[()]', response.css('#ContentPlaceHolderMenuListe_lblSchulart::text').extract_first())
         meta['schooltype'] = self.fix_data(schooltype[0].strip())
         meta['legal_status'] = self.fix_data(schooltype[1].strip())
@@ -42,7 +45,7 @@ def parse_detail(self, response):
         meta['web'] = self.fix_data(response.css('#ContentPlaceHolderMenuListe_HLinkWeb::attr(href)').extract_first())
         headmaster = response.css('#ContentPlaceHolderMenuListe_lblLeitung::text').extract_first()
         if headmaster:
-            meta['headmaster'] = self.fix_data(' '.join(headmaster.split(',')[::-1]).strip())
+            meta['headmaster'] = self.fix_data(' '.join(headmaster.split(',')[::-1]).strip())
         meta['cookiejar'] = response.meta['cookiejar']
         meta['data_url'] = response.url
         activities = self.fix_data(response.css('#ContentPlaceHolderMenuListe_lblAGs::text').extract_first())
@@ -51,7 +54,70 @@ def parse_detail(self, response):
         partner = self.fix_data(response.css('#ContentPlaceHolderMenuListe_lblPartner::text').extract_first())
         if partner:
             meta['partner'] = [x.strip() for x in partner.split(';')]
-        yield meta
+        yield scrapy.Request(self.base_url + 'schuelerschaft.aspx?view=jgs', callback=self.parse_students, meta=meta,
+                             dont_filter=True)
+
+    def parse_students(self, response):
+        """Queue the per-year student pages; go straight to the staff page when none are linked."""
+        years = response.css('#portrait_hauptnavi li a::attr(href)').extract()
+        relevant = [year for year in years if 'view=jgs&jahr' in year]
+        meta = response.meta
+        if relevant:
+            meta['student_years'] = relevant[1:]
+            yield scrapy.Request(self.base_url + relevant[0], callback=self.parse_student_year, meta=meta,
+                                 dont_filter=True)
+        else:
+            yield scrapy.Request(self.url_parse_staff, callback=self.parse_staff, meta=meta, dont_filter=True)
+
+    def parse_student_year(self, response):
+        """Store one year's student table, then request the next queued year or move on to the staff page."""
+        meta = response.meta
+        # Parse before checking the queue: the page of the last queued year has to be stored as well.
+        meta.setdefault('students', []).extend(self._parse_year_table(response))
+        remaining = meta['student_years']
+        if remaining:
+            meta['student_years'] = remaining[1:]
+            yield scrapy.Request(self.base_url + remaining[0], callback=self.parse_student_year, meta=meta,
+                                 dont_filter=True)
+        else:
+            yield scrapy.Request(self.url_parse_staff, callback=self.parse_staff, meta=meta, dont_filter=True)
+
+    def parse_staff(self, response):
+        """Queue the per-year staff pages; yield the collected data when none are linked."""
+        # Slice instead of indexing so a page without the staff navigation does not raise IndexError.
+        years = response.css('#NaviSchulpersonal ul')[:1].css('li a[href*="jahr"]::attr(href)').extract()
+        meta = response.meta
+        if years:
+            meta['staff_years'] = years[1:]
+            yield scrapy.Request(self.base_url + years[0], callback=self.parse_staff_year, meta=meta, dont_filter=True)
+        else:
+            yield meta
+
+    def parse_staff_year(self, response):
+        """Store one year's staff table, then request the next queued year or yield the collected data."""
+        meta = response.meta
+        meta.setdefault('staff', []).extend(self._parse_year_table(response))
+        remaining = meta['staff_years']
+        if remaining:
+            meta['staff_years'] = remaining[1:]
+            yield scrapy.Request(self.base_url + remaining[0], callback=self.parse_staff_year, meta=meta,
+                                 dont_filter=True)
+        else:
+            yield meta
+
+    def _parse_year_table(self, response):
+        """Return the table rows of a year page as dicts keyed by column header, each tagged with 'year'."""
+        headers = response.css('th::text').extract()
+        caption = self.fix_data(response.css('table caption::text').extract_first())
+        # The caption may be missing; only strip the 'Jahrgangsstufen' label when there is a caption.
+        title = caption.replace('Jahrgangsstufen', '').strip() if caption else ''
+        results = []
+        for row in response.css('table tr.odd, table tr.even'):
+            # zip() tolerates rows with fewer cells than headers instead of raising IndexError.
+            result = dict(zip(headers, row.css('td::text').extract()))
+            result['year'] = title
+            results.append(result)
+        return results
 
     def _parse_school_no(self, url):
         """Parses the school number from the 'IDSchulzweig' parameter in the url"""
@@ -67,6 +133,7 @@ def fix_data(self, string):
             string = ' '.join(string.split())
             string.replace('\n', '')
             string.replace('\t', '')
+            string = string.replace('\r', '')
         return string
 
     def normalize(self, item: Item) -> School:
@@ -81,4 +148,4 @@ def normalize(self, item: Item) -> School:
                       fax=item.get('fax'),
                       phone=item.get('telephone'),
                       director=item.get('headmaster'),
-                      legal_status=item.get('legal_status'))
\ No newline at end of file
+                      legal_status=item.get('legal_status'))