Skip to content

use day before yesterday as cutoff #33

use day before yesterday as cutoff

use day before yesterday as cutoff #33

Workflow file for this run

# Daily crawler: restore the SQLite DB from the previous run's artifact
# (or create a fresh one), crawl the day-before-yesterday's data through a
# Selenium service container, insert it, and re-upload the DB as an artifact.
name: Crawl data and save artifact

on:
  push:
    branches: [main, github-actions]
  schedule:
    - cron: '0 0 * * *'  # Runs every day at midnight (UTC)

env:
  DB_FILENAME: 'production.db'
  # Quoted: an unquoted `true` is a YAML boolean; env values should be strings.
  CREATE_DB: 'true'
  DATA_PATH: './data'
  # Quoted so YAML does not retype the season as an integer.
  CURRENT_SEASON: '2023'

jobs:
  setup:
    runs-on: ubuntu-latest
    services:
      # Standalone Firefox the crawler reaches at http://localhost:4444.
      selenium:
        image: selenium/standalone-firefox
        env:
          SE_NODE_MAX_SESSIONS: '5'
        options: >-
          --shm-size="2g"
        ports:
          - '4444:4444'
          - '7900:7900'
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
          clean: true

      # Restore the database saved by a previous run.
      # NOTE(review): download-artifact@v4 only sees artifacts from the
      # *current* workflow run unless `run-id` and `github-token` are
      # supplied — confirm this actually restores the prior day's DB on
      # scheduled runs.
      - name: Download database
        id: download_artifact
        uses: actions/download-artifact@v4
        with:
          # `with:` inputs do not expand shell-style $VARS; the original
          # `$DB_FILENAME` was passed through literally. Use the
          # expression syntax instead.
          name: ${{ env.DB_FILENAME }}
          # `path` is the destination *directory*; download into the
          # workspace root so later steps find ./production.db.
          path: .

      # Bootstrap an empty database when no artifact exists yet
      # (first run, or the previous artifact expired).
      - name: Create DB
        if: ${{ failure() && steps.download_artifact.conclusion == 'failure' }}
        run: |
          pip install -r requirements.txt
          python scripts/create_database.py --filename "$DB_FILENAME"

      # Crawl the day before yesterday (the data cutoff) and insert the
      # resulting JSON into the database. Runs whether the DB was
      # downloaded or freshly created, as long as the file exists.
      - name: Run Selenium crawler script
        id: crawler
        if: ${{ always() && hashFiles(env.DB_FILENAME) != '' }}
        run: |
          pip install -r requirements.txt
          mkdir -p "$DATA_PATH"
          python scripts/crawl_concurrently.py \
            --path "$DATA_PATH" \
            --season "$CURRENT_SEASON" \
            --server http://localhost:4444 \
            --date "$(date -d '2 days ago' +'%Y-%m-%d')"
          python scripts/insert_data.py -db "$DB_FILENAME" --data $(ls "$DATA_PATH/$CURRENT_SEASON"/*.json)

      # Persist the updated database for the next scheduled run.
      - name: Upload database as artifact
        if: ${{ always() && steps.crawler.conclusion == 'success' }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.DB_FILENAME }}
          path: ${{ env.DB_FILENAME }}
          overwrite: true