-
-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Scraping code for Share Food Program
- Loading branch information
Showing
5 changed files
with
245 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Data Utilities | ||
|
||
This directory contains a variety of utilities, code, and information related to our data operations on PHLask. | ||
|
||
Check out the individual folders for more information. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Share Food Program Scraping | ||
|
||
The Share Food Program can be found here: https://www.sharefoodprogram.org/ | ||
|
||
The site contains regularly-updated information about food resources in the Philadelphia area. This directory contains Python code for scraping this site. | ||
|
||
## Setup | ||
|
||
### Install Python | ||
|
||
First, make sure to have Python 3.12+ installed. We also recommend using [PyCharm](https://www.jetbrains.com/pycharm/download) for Python development. | ||
|
||
### Create a Virtual Environment and Install Dependencies | ||
|
||
Inside of this directory, run the following commands: | ||
|
||
```bash | ||
python -m venv .venv | ||
# If on Mac/Linux | ||
source .venv/bin/activate | ||
# If on Windows | ||
.venv\Scripts\activate | ||
pip install -r requirements.txt | ||
``` | ||
|
||
### Add Firebase Credentials | ||
|
||
To run the scraper and upload the data to Firebase, you will need to add your Firebase credentials to this folder. Message us in the #phlask_data channel on Slack to get access. | ||
|
||
### Run the Scraper | ||
|
||
To run the scraper, use the following command, making sure to set the URL below to the correct URL for your Firebase instance. | ||
|
||
```bash | ||
python scrape_share_food_program.py https://phlask-share-food-test.firebaseio.com/ | ||
``` | ||
|
||
You should see output like the following: | ||
|
||
``` | ||
Got 169 new resources from the scraped resource | ||
Using DB URL: https://phlask-share-food-test.firebaseio.com/ | ||
Loaded PHLASK DB reference with 819 resources | ||
Removed 169 existing scraped resources from the DB | ||
We now have 819 total resources in the DB | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
asttokens==3.0.0 | ||
attrs==24.3.0 | ||
backcall==0.2.0 | ||
beautifulsoup4==4.12.3 | ||
bleach==6.2.0 | ||
CacheControl==0.14.1 | ||
cachetools==5.5.0 | ||
certifi==2024.8.30 | ||
cffi==1.17.1 | ||
charset-normalizer==3.4.0 | ||
colorama==0.4.6 | ||
cryptography==43.0.3 | ||
decorator==5.1.1 | ||
defusedxml==0.7.1 | ||
docopt==0.6.2 | ||
executing==2.1.0 | ||
fastjsonschema==2.21.1 | ||
firebase-admin==6.6.0 | ||
google-api-core==2.23.0 | ||
google-api-python-client==2.154.0 | ||
google-auth==2.36.0 | ||
google-auth-httplib2==0.2.0 | ||
google-cloud-core==2.4.1 | ||
google-cloud-firestore==2.19.0 | ||
google-cloud-storage==2.18.2 | ||
google-crc32c==1.6.0 | ||
google-resumable-media==2.7.2 | ||
googleapis-common-protos==1.66.0 | ||
grpcio==1.68.0 | ||
grpcio-status==1.68.0 | ||
httplib2==0.22.0 | ||
idna==3.10 | ||
ipython==8.12.3 | ||
jedi==0.19.2 | ||
Jinja2==3.1.5 | ||
jsonschema==4.23.0 | ||
jsonschema-specifications==2024.10.1 | ||
jupyter_client==8.6.3 | ||
jupyter_core==5.7.2 | ||
jupyterlab_pygments==0.3.0 | ||
MarkupSafe==3.0.2 | ||
matplotlib-inline==0.1.7 | ||
mistune==3.1.0 | ||
msgpack==1.1.0 | ||
nbclient==0.10.2 | ||
nbconvert==7.16.5 | ||
nbformat==5.10.4 | ||
packaging==24.2 | ||
pandocfilters==1.5.1 | ||
parso==0.8.4 | ||
pickleshare==0.7.5 | ||
pipreqs==0.5.0 | ||
platformdirs==4.3.6 | ||
prompt_toolkit==3.0.48 | ||
proto-plus==1.25.0 | ||
protobuf==5.28.3 | ||
pure_eval==0.2.3 | ||
pyasn1==0.6.1 | ||
pyasn1_modules==0.4.1 | ||
pycparser==2.22 | ||
Pygments==2.19.1 | ||
PyJWT==2.10.0 | ||
pyparsing==3.2.0 | ||
python-dateutil==2.9.0.post0 | ||
pywin32==308 | ||
pyzmq==26.2.0 | ||
referencing==0.35.1 | ||
requests==2.32.3 | ||
rpds-py==0.22.3 | ||
rsa==4.9 | ||
six==1.17.0 | ||
soupsieve==2.6 | ||
stack-data==0.6.3 | ||
tinycss2==1.4.0 | ||
tornado==6.4.2 | ||
traitlets==5.14.3 | ||
uritemplate==4.1.1 | ||
urllib3==2.2.3 | ||
wcwidth==0.2.13 | ||
webencodings==0.5.1 | ||
yarg==0.1.9 |
106 changes: 106 additions & 0 deletions
106
data/scrape-share-food-program/scrape_share_food_program.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
import requests | ||
import datetime | ||
from bs4 import BeautifulSoup | ||
import os | ||
import sys | ||
import uuid | ||
|
||
|
||
def extract_street_from_address(address):
    """Return the street portion of a full address string.

    The scraped addresses look like "123 Main St, Philadelphia, PA 19104";
    only the text before the first ", " is kept. An address with no comma
    is returned unchanged.
    """
    # str.partition returns the whole string as the head when the
    # separator is absent, matching split(', ')[0] exactly.
    street, _, _ = address.partition(', ')
    return street
|
||
|
||
def convert_html_to_text(html):
    """Convert an HTML fragment into a plain-text string.

    Common container and line-break tags are flattened to their text
    followed by a newline so the final text reads as separate lines.
    """
    parsed = BeautifulSoup(html, "html.parser")
    # Replace each tag with its text plus a trailing newline, in document
    # order, before extracting the remaining text.
    for tag in parsed.find_all(["a", "p", "div", "h3", "br"]):
        tag.replace_with(tag.text + "\n")
    return parsed.get_text(separator="\n")
|
||
|
||
# WP Google Maps REST endpoint that serves the Share Food Program map
# markers (the base64 suffix encodes the plugin's query parameters).
url = "https://www.sharefoodprogram.org/wp-json/wpgmza/v1/features/base64eJyrVkrLzClJLVKyUqqOUcpNLIjPTIlRsopRMo5R0gEJFGeUFni6FAPFomOBAsmlxSX5uW6ZqTkpELFapVoABaMWvA"

# NOTE(review): the explicit User-Agent override suggests the site rejects
# the default python-requests agent — confirm before removing.
response = requests.get(url, headers={
    'User-Agent': 'PostmanRuntime/7.43.0'
})

data = response.json()

# Newly scraped resources, keyed by a freshly generated UUID.
new_phlask_data = {}

for marker in data['markers']:

    # Skip markers the site itself has not approved for display.
    if marker['approved'] != "1":
        continue

    current_timestamp = datetime.datetime.now().isoformat()

    new_phlask_resource = {
        "address": extract_street_from_address(marker['address']),
        "city": "Philadelphia",
        "creator": "phlask",
        "date_created": current_timestamp,
        "description": convert_html_to_text(marker['description']),
        "entry_type": "UNSURE",
        "last_modified": current_timestamp,
        "last_modifier": "phlask",
        "latitude": float(marker['lat']),
        "longitude": float(marker['lng']),
        "name": marker['title'],
        "resource_type": "FOOD",
        "source": {
            "type": "WEB_SCRAPE",
            "url": url,
            "logo_url": "https://www.sharefoodprogram.org/wp-content/themes/sharefood-theme/images/svg/share-food-program-logo.svg"
        },
        "state": "PA",
        "status": "OPERATIONAL",
        "verification": {
            "last_modified": current_timestamp,
            "last_modifier": "phlask",
            "verified": True
        },
        "version": 1,
        "food": {
            "food_type": [],
            "distribution_type": [],
            "organization_type": []
        },
        "zip_code": "19104"  # TODO: Change to zip code from lookup using geocoding
    }

    new_phlask_data[str(uuid.uuid4())] = new_phlask_resource

print(f"Got {len(new_phlask_data)} new resources from the scraped resource")

# firebase-admin discovers its service-account credentials through this
# environment variable, so it must be set before initialize_app() runs.
cert_path = os.path.abspath("firebase_cert.json")
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cert_path

# Grab the DB URL from the python run command.
# Validate BEFORE indexing sys.argv so a missing argument raises the
# intended ValueError rather than an unexplained IndexError.
if len(sys.argv) < 2 or not sys.argv[1]:
    raise ValueError("DB_URL not set, make sure to append this after your python run command")
DB_URL = sys.argv[1]
print(f"Using DB URL: {DB_URL}")

# Imported here (not at the top of the file) so the credentials env var
# above is in place before any firebase initialization.
from firebase_admin import initialize_app, db
default_app = initialize_app()
ref = db.reference(url=DB_URL)
# ref.get() returns None when the DB node is empty; normalize to {} so the
# len() call and the dedupe comprehension below work on a fresh DB.
existing_phlask_data: dict = ref.get() or {}
print(f"Loaded PHLASK DB reference with {len(existing_phlask_data)} resources")

# Remove the existing resources from this scraped data so we don't have duplicates
before_len = len(existing_phlask_data)
existing_phlask_data = {
    resource_name: resource
    for resource_name, resource in existing_phlask_data.items()
    # Chained .get() calls: hand-entered resources may lack a "source"
    # record entirely, which would otherwise raise KeyError here.
    if resource.get('source', {}).get('url') != url
}
after_len = len(existing_phlask_data)
print(f"Removed {before_len - after_len} existing scraped resources from the DB")

# Add the new resources
existing_phlask_data.update(new_phlask_data)

# Set the new data in firebase
ref.set(existing_phlask_data)

# Verify that the new data was pushed
new_data_test = ref.get()
print(f"We now have {len(new_data_test)} total resources in the DB")