feat: auto update database

TimeTrapzz committed Oct 25, 2024
1 parent 6093594 commit d764894
Showing 4 changed files with 187 additions and 11 deletions.
52 changes: 52 additions & 0 deletions .github/workflows/update_database.yml
@@ -0,0 +1,52 @@
name: DBLP Database Update

on:
  schedule:
    - cron: '0 0 * * 0' # runs every Sunday at 00:00 UTC
  workflow_dispatch:    # allows manual triggering

jobs:
  update-dblp:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      # 1. Set up the Python environment
      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: '3.12'

      # 2. Install dependencies
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install lxml tqdm

      - name: Setup PostgreSQL
        uses: tj-actions/install-postgresql@v3
        with:
          postgresql-version: 16

      # 3. Download the DBLP data
      - name: Download and process DBLP
        run: |
          wget https://dblp.org/xml/dblp.dtd
          wget https://dblp.org/xml/dblp.xml.gz
          wget https://dblp.org/xml/dblp.xml.gz.md5

      # 4. Convert to SQL
      - name: Convert to SQL
        run: |
          python scripts/convert.py --dtd_file dblp.dtd --xml_file dblp.xml.gz --md5_file dblp.xml.gz.md5 --output_sql_file dblp.sql

      # 5. Import into PostgreSQL
      - name: Import to PostgreSQL
        run: |
          psql -h ${{ secrets.DB_HOST }} -p 5432 -U ${{ secrets.DB_USER }} -d ${{ secrets.DB_NAME }} -f dblp.sql

      # 6. Clean up temporary files
      - name: Cleanup
        run: |
          rm dblp.xml.gz dblp.xml.gz.md5 dblp.dtd dblp.sql
129 changes: 129 additions & 0 deletions scripts/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import gzip
from lxml import etree
from tqdm import tqdm
import logging
import argparse
import hashlib


logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')


def read_xml(dtd_file, xml_file, md5_file):
    with open(md5_file, 'r') as f:
        md5_content = f.read().strip().split(' ')[0]
    with open(xml_file, 'rb') as f:
        file_md5 = hashlib.md5(f.read()).hexdigest()
    logger.info(f"Computed MD5: {file_md5}")
    logger.info(f"Expected MD5: {md5_content}")
    if file_md5 != md5_content:
        raise Exception("MD5 checksum verification failed")

    dtd = etree.DTD(file=dtd_file)

    with gzip.open(xml_file, 'rb') as f:
        parser = etree.XMLParser(dtd_validation=True)
        tree = etree.parse(f, parser)
        root = tree.getroot()

    # Validate the XML against the DTD
    if not dtd.validate(tree):
        raise Exception("XML file does not conform to the DTD")

    return root
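

# read_xml above loads the whole compressed dump into memory just to hash it;
# dblp.xml.gz runs to hundreds of megabytes. A chunked alternative could look
# like this sketch (illustrative helper, not wired into read_xml):
def md5_of_file(path, chunk_size=1 << 20):
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        # Feed the file to the digest in 1 MiB chunks.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()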


def process_title(title_elem):
    if title_elem is None:
        return ""
    # Flatten any embedded HTML tags to plain text
    title_text = ''.join(title_elem.itertext()).strip()
    # Keep only ASCII alphanumeric characters and lowercase them
    title_text = ''.join(char.lower()
                         for char in title_text if char.isalnum() and char.isascii())
    return title_text
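

# A quick sanity check of the normalization above (illustrative input):
#
#   >>> elem = etree.fromstring("<title>Attention <i>Is</i> All You Need.</title>")
#   >>> process_title(elem)
#   'attentionisallyouneed'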


def parse_xml(root):
    parsed_data = []
    for elem in tqdm(root, desc="Parsing XML data"):
        if elem.tag in ['article', 'inproceedings', 'proceedings', 'book', 'incollection',
                        'phdthesis', 'mastersthesis', 'www', 'person', 'data']:
            try:
                url = elem.get('key')
                entry_type = elem.tag

                title_elem = elem.find('title')
                title_text = process_title(title_elem)

                parsed_data.append((url, title_text, entry_type))
            except Exception as e:
                logger.error(f"Error while processing an entry: {e}")
                logger.error(etree.tostring(
                    elem, encoding='unicode', pretty_print=True))

    logger.info(f"Parsing complete; processed {len(parsed_data)} entries")

    return parsed_data
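

# parse_xml above iterates over a fully materialized tree; dblp.xml is several
# gigabytes uncompressed, so peak memory is high. A streaming variant using
# lxml's iterparse could look like this sketch (assumption: DTD/entity
# resolution behaves when parsing from a file object; not used by __main__):
RECORD_TAGS = ('article', 'inproceedings', 'proceedings', 'book', 'incollection',
               'phdthesis', 'mastersthesis', 'www', 'person', 'data')


def parse_xml_streaming(xml_file):
    parsed_data = []
    with gzip.open(xml_file, 'rb') as f:
        # load_dtd lets the named character entities used in dblp.xml resolve.
        for _, elem in etree.iterparse(f, tag=RECORD_TAGS, load_dtd=True):
            parsed_data.append(
                (elem.get('key'), process_title(elem.find('title')), elem.tag))
            # Free finished elements so memory stays bounded.
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
    return parsed_data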


def create_database_sql(parsed_data, sql_file):
    with open(sql_file, 'w') as f:
        # Create the table along with a full-text (GIN) index
        f.write("""
CREATE TABLE IF NOT EXISTS dblp_entries_tmp (
    id SERIAL PRIMARY KEY,
    url TEXT UNIQUE,
    title TEXT,
    type TEXT
);
CREATE INDEX IF NOT EXISTS dblp_entries_title_idx_tmp ON dblp_entries_tmp USING GIN(to_tsvector('english', title));
""")

        # Insert the data in batches
        batch_size = 3000
        total_batches = len(parsed_data) // batch_size + \
            (1 if len(parsed_data) % batch_size != 0 else 0)

        for batch in tqdm(range(total_batches), desc="Inserting data"):
            start_idx = batch * batch_size
            end_idx = min((batch + 1) * batch_size, len(parsed_data))
            batch_data = parsed_data[start_idx:end_idx]

            f.write("INSERT INTO dblp_entries_tmp (url, title, type) VALUES\n")
            for i, (url, title, entry_type) in enumerate(batch_data):
                f.write(f"('{url}', '{title}', '{entry_type}')")
                if i < len(batch_data) - 1:
                    f.write(",\n")
                else:
                    f.write(";\n")

            if batch < total_batches - 1:
                f.write("\n")

        # Swap the freshly built table in for the old one
        f.write("""
DROP TABLE IF EXISTS dblp_entries;
ALTER TABLE dblp_entries_tmp RENAME TO dblp_entries;
ALTER INDEX dblp_entries_title_idx_tmp RENAME TO dblp_entries_title_idx;
""")


if __name__ == "__main__":
    args_parser = argparse.ArgumentParser(
        description="Convert DBLP XML to SQL")
    args_parser.add_argument("--dtd_file", type=str,
                             default="dblp.dtd", help="DTD file name")
    args_parser.add_argument("--xml_file", type=str,
                             default="dblp.xml.gz", help="XML file name")
    args_parser.add_argument("--md5_file", type=str,
                             default="dblp.xml.gz.md5", help="MD5 file name")
    args_parser.add_argument(
        "--output_sql_file", type=str, default="dblp.sql", help="Output SQL file name")
    args = args_parser.parse_args()

    root = read_xml(args.dtd_file, args.xml_file, args.md5_file)
    parsed_data = parse_xml(root)
    create_database_sql(parsed_data, args.output_sql_file)
13 changes: 6 additions & 7 deletions src/index.ts
@@ -14,12 +14,7 @@ import postgres from "postgres";

 export default {
   async fetch(request: any, env: any, ctx: any): Promise<Response> {
-    const sql = postgres({
-      username: env.DB_USERNAME,
-      password: env.DB_PASSWORD,
-      host: env.DB_HOST,
-      port: env.DB_PORT,
-      database: env.DB_NAME,
+    const sql = postgres(env.DB_URL, {
       ssl: {
         rejectUnauthorized: true
       }
@@ -49,7 +44,7 @@ async function handleGet(request: any, sql: any): Promise<Response> {
       headers: { "Content-Type": "application/json" }
     });
   }
-  const searchQuery = query.split(' ').map((word: any) => `${word}:*`).join(' & ');
+  const searchQuery = query.trim() == '' ? 'qaqzzz:*' : `${query}:*`;
   const result = await sql`
     SELECT url, title,
       ts_rank_cd(to_tsvector('english', title), to_tsquery('english', ${searchQuery})) AS rank
@@ -87,6 +82,10 @@ async function handlePost(request: any, sql: any): Promise<Response> {
   }

   const searchQueries = queries.map(query => {
+    // If the query is empty or only whitespace
+    if (query.trim() == '') {
+      return 'qaqzzz:*';
+    }
     return `${query}:*`;
   });

4 changes: 0 additions & 4 deletions wrangler.toml
@@ -23,10 +23,6 @@ enabled = true
 # Note: Use secrets to store sensitive data.
 # - https://developers.cloudflare.com/workers/configuration/secrets/
 [vars]
-DB_USERNAME = "timetrap"
-DB_HOST = "dblp.postgres.database.azure.com"
-DB_PORT = 5432
-DB_NAME = "postgres"

 # Bind the Workers AI model catalog. Run machine learning models, powered by serverless GPUs, on Cloudflare's global network
 # Docs: https://developers.cloudflare.com/workers/wrangler/configuration/#workers-ai
