-
-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Scraping code for Share Food Program
- Loading branch information
Showing
5 changed files
with
245 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Data Utilities | ||
|
||
This directory contains a variety of utilities, code, and information related to our data operations on PHLask. | ||
|
||
Check out the individual folders for more information. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Share Food Program Scraping | ||
|
||
The Share Food Program can be found here: https://www.sharefoodprogram.org/ | ||
|
||
The site contains regularly-updated information about food resources in the Philadelphia area. This directory contains Python code for scraping this site. | ||
|
||
## Setup | ||
|
||
### Install Python | ||
|
||
First, make sure to have Python 3.12+ installed. We also recommend using [PyCharm](https://www.jetbrains.com/pycharm/download) for Python development. | ||
|
||
### Create a Virtual Environment and Install Dependencies | ||
|
||
Inside of this directory, run the following commands: | ||
|
||
```bash | ||
python -m venv .venv | ||
# If on Mac/Linux | ||
source .venv/bin/activate | ||
# If on Windows | ||
.venv\Scripts\activate | ||
pip install -r requirements.txt | ||
``` | ||
|
||
### Add Firebase Credentials | ||
|
||
To run the scraper and upload the data to Firebase, you will need to add your Firebase credentials to this folder. Message us in the #phlask_data channel on Slack to get access. | ||
|
||
### Run the Scraper | ||
|
||
To run the scraper, use the following command, making sure to set the URL below to the correct URL for your Firebase instance. | ||
|
||
```bash | ||
python scrape_share_food_program.py https://phlask-share-food-test.firebaseio.com/ | ||
``` | ||
|
||
You should see output like the following: | ||
|
||
``` | ||
Got 169 new resources from the scraped resource | ||
Using DB URL: https://phlask-share-food-test.firebaseio.com/ | ||
Loaded PHLASK DB reference with 819 resources | ||
Removed 169 existing scraped resources from the DB | ||
We now have 819 total resources in the DB | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
asttokens==3.0.0 | ||
attrs==24.3.0 | ||
backcall==0.2.0 | ||
beautifulsoup4==4.12.3 | ||
bleach==6.2.0 | ||
CacheControl==0.14.1 | ||
cachetools==5.5.0 | ||
certifi==2024.8.30 | ||
cffi==1.17.1 | ||
charset-normalizer==3.4.0 | ||
colorama==0.4.6 | ||
cryptography==43.0.3 | ||
decorator==5.1.1 | ||
defusedxml==0.7.1 | ||
docopt==0.6.2 | ||
executing==2.1.0 | ||
fastjsonschema==2.21.1 | ||
firebase-admin==6.6.0 | ||
google-api-core==2.23.0 | ||
google-api-python-client==2.154.0 | ||
google-auth==2.36.0 | ||
google-auth-httplib2==0.2.0 | ||
google-cloud-core==2.4.1 | ||
google-cloud-firestore==2.19.0 | ||
google-cloud-storage==2.18.2 | ||
google-crc32c==1.6.0 | ||
google-resumable-media==2.7.2 | ||
googleapis-common-protos==1.66.0 | ||
grpcio==1.68.0 | ||
grpcio-status==1.68.0 | ||
httplib2==0.22.0 | ||
idna==3.10 | ||
ipython==8.12.3 | ||
jedi==0.19.2 | ||
Jinja2==3.1.5 | ||
jsonschema==4.23.0 | ||
jsonschema-specifications==2024.10.1 | ||
jupyter_client==8.6.3 | ||
jupyter_core==5.7.2 | ||
jupyterlab_pygments==0.3.0 | ||
MarkupSafe==3.0.2 | ||
matplotlib-inline==0.1.7 | ||
mistune==3.1.0 | ||
msgpack==1.1.0 | ||
nbclient==0.10.2 | ||
nbconvert==7.16.5 | ||
nbformat==5.10.4 | ||
packaging==24.2 | ||
pandocfilters==1.5.1 | ||
parso==0.8.4 | ||
pickleshare==0.7.5 | ||
pipreqs==0.5.0 | ||
platformdirs==4.3.6 | ||
prompt_toolkit==3.0.48 | ||
proto-plus==1.25.0 | ||
protobuf==5.28.3 | ||
pure_eval==0.2.3 | ||
pyasn1==0.6.1 | ||
pyasn1_modules==0.4.1 | ||
pycparser==2.22 | ||
Pygments==2.19.1 | ||
PyJWT==2.10.0 | ||
pyparsing==3.2.0 | ||
python-dateutil==2.9.0.post0 | ||
pywin32==308 | ||
pyzmq==26.2.0 | ||
referencing==0.35.1 | ||
requests==2.32.3 | ||
rpds-py==0.22.3 | ||
rsa==4.9 | ||
six==1.17.0 | ||
soupsieve==2.6 | ||
stack-data==0.6.3 | ||
tinycss2==1.4.0 | ||
tornado==6.4.2 | ||
traitlets==5.14.3 | ||
uritemplate==4.1.1 | ||
urllib3==2.2.3 | ||
wcwidth==0.2.13 | ||
webencodings==0.5.1 | ||
yarg==0.1.9 |
106 changes: 106 additions & 0 deletions
106
data/scrape-share-food-program/scrape_share_food_program.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
import requests | ||
import datetime | ||
from bs4 import BeautifulSoup | ||
import os | ||
import sys | ||
import uuid | ||
|
||
|
||
def extract_street_from_address(address):
    """Return the street portion of a full address string.

    The scraped addresses look like "123 Main St, Philadelphia, PA 19104";
    only the text before the first ", " is kept. An address with no comma
    is returned unchanged.
    """
    # str.partition returns the whole string as the head when the
    # separator is absent, matching split(', ')[0] exactly.
    street, _, _ = address.partition(', ')
    return street
|
||
|
||
def convert_html_to_text(html):
    """Convert an HTML fragment into a plain-text string.

    Common container and line-break tags are flattened to their text
    followed by a newline so the final text reads as separate lines.
    """
    parsed = BeautifulSoup(html, "html.parser")
    # Replace each tag with its text plus a trailing newline, in document
    # order, before extracting the remaining text.
    for tag in parsed.find_all(["a", "p", "div", "h3", "br"]):
        tag.replace_with(tag.text + "\n")
    return parsed.get_text(separator="\n")
|
||
|
||
# WP Google Maps REST endpoint that serves the Share Food Program map
# markers (the base64 suffix encodes the plugin's query parameters).
url = "https://www.sharefoodprogram.org/wp-json/wpgmza/v1/features/base64eJyrVkrLzClJLVKyUqqOUcpNLIjPTIlRsopRMo5R0gEJFGeUFni6FAPFomOBAsmlxSX5uW6ZqTkpELFapVoABaMWvA"

# NOTE(review): the explicit User-Agent override suggests the site rejects
# the default python-requests agent — confirm before removing.
response = requests.get(url, headers={
    'User-Agent': 'PostmanRuntime/7.43.0'
})

data = response.json()

# Newly scraped resources, keyed by a freshly generated UUID.
new_phlask_data = {}

for marker in data['markers']:

    # Skip markers the site itself has not approved for display.
    if marker['approved'] != "1":
        continue

    current_timestamp = datetime.datetime.now().isoformat()

    new_phlask_resource = {
        "address": extract_street_from_address(marker['address']),
        "city": "Philadelphia",
        "creator": "phlask",
        "date_created": current_timestamp,
        "description": convert_html_to_text(marker['description']),
        "entry_type": "UNSURE",
        "last_modified": current_timestamp,
        "last_modifier": "phlask",
        "latitude": float(marker['lat']),
        "longitude": float(marker['lng']),
        "name": marker['title'],
        "resource_type": "FOOD",
        "source": {
            "type": "WEB_SCRAPE",
            "url": url,
            "logo_url": "https://www.sharefoodprogram.org/wp-content/themes/sharefood-theme/images/svg/share-food-program-logo.svg"
        },
        "state": "PA",
        "status": "OPERATIONAL",
        "verification": {
            "last_modified": current_timestamp,
            "last_modifier": "phlask",
            "verified": True
        },
        "version": 1,
        "food": {
            "food_type": [],
            "distribution_type": [],
            "organization_type": []
        },
        "zip_code": "19104"  # TODO: Change to zip code from lookup using geocoding
    }

    new_phlask_data[str(uuid.uuid4())] = new_phlask_resource

print(f"Got {len(new_phlask_data)} new resources from the scraped resource")

# firebase-admin discovers its service-account credentials through this
# environment variable, so it must be set before initialize_app() runs.
cert_path = os.path.abspath("firebase_cert.json")
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cert_path

# Grab the DB URL from the python run command.
# Validate BEFORE indexing sys.argv so a missing argument raises the
# intended ValueError rather than an unexplained IndexError.
if len(sys.argv) < 2 or not sys.argv[1]:
    raise ValueError("DB_URL not set, make sure to append this after your python run command")
DB_URL = sys.argv[1]
print(f"Using DB URL: {DB_URL}")

# Imported here (not at the top of the file) so the credentials env var
# above is in place before any firebase initialization.
from firebase_admin import initialize_app, db
default_app = initialize_app()
ref = db.reference(url=DB_URL)
# ref.get() returns None when the DB node is empty; normalize to {} so the
# len() call and the dedupe comprehension below work on a fresh DB.
existing_phlask_data: dict = ref.get() or {}
print(f"Loaded PHLASK DB reference with {len(existing_phlask_data)} resources")

# Remove the existing resources from this scraped data so we don't have duplicates
before_len = len(existing_phlask_data)
existing_phlask_data = {
    resource_name: resource
    for resource_name, resource in existing_phlask_data.items()
    # Chained .get() calls: hand-entered resources may lack a "source"
    # record entirely, which would otherwise raise KeyError here.
    if resource.get('source', {}).get('url') != url
}
after_len = len(existing_phlask_data)
print(f"Removed {before_len - after_len} existing scraped resources from the DB")

# Add the new resources
existing_phlask_data.update(new_phlask_data)

# Set the new data in firebase
ref.set(existing_phlask_data)

# Verify that the new data was pushed
new_data_test = ref.get()
print(f"We now have {len(new_data_test)} total resources in the DB")