From 1121fbc2a0541fbf88529c4ec95f711d687ba92a Mon Sep 17 00:00:00 2001
From: wj23027 <809711241@qq.com>
Date: Wed, 4 Dec 2024 20:02:14 +0800
Subject: [PATCH 1/2] feat: developer workflow

---
 .github/workflows/updateDeveloperData.yml   |  52 ++++
 dashboard/developer/script/get_data.py      | 233 ++++++++++++++++++
 .../developer/script/get_data_github_api.py | 145 +++++++++++
 dashboard/developer/script/update_data.py   | 154 ++++++++++++
 4 files changed, 584 insertions(+)
 create mode 100644 .github/workflows/updateDeveloperData.yml
 create mode 100644 dashboard/developer/script/get_data.py
 create mode 100644 dashboard/developer/script/get_data_github_api.py
 create mode 100644 dashboard/developer/script/update_data.py

diff --git a/.github/workflows/updateDeveloperData.yml b/.github/workflows/updateDeveloperData.yml
new file mode 100644
index 0000000..5c786dc
--- /dev/null
+++ b/.github/workflows/updateDeveloperData.yml
@@ -0,0 +1,52 @@
+name: Monthly Python Script Execution
+
+on:
+  schedule:
+    # run at 00:00 UTC on the 2nd of every month
+    - cron: '0 0 2 * *'
+  workflow_dispatch: # also allow manual triggering
+
+jobs:
+  run_python_script:
+    runs-on: ubuntu-latest # GitHub-hosted Ubuntu runner
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3 # check out the repository
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.12' # pin the Python version
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Start ClickHouse service
+        run: |
+          docker run -d --name clickhouse-server --ulimit nofile=262144:262144 -p 9000:9000 clickhouse/clickhouse-server
+          sleep 30
+
+      - name: List running containers
+        run: docker ps -a
+
+      - name: Check ClickHouse logs
+        run: docker logs clickhouse-server
+
+      - name: Test ClickHouse query
+        run: |
+          docker exec -i clickhouse-server clickhouse-client --query "SELECT 1"
+
+      - name: Run the Python script
+        run: |
+          python dashboard/developer/script/update_data.py
+        env:
+          xlabDB_HOST: ${{ secrets.DB_HOST }}
+          xlabDB_USER: ${{ secrets.DB_USER }}
+          xlabDB_PASSWORD: ${{ secrets.DB_PASSWORD }}
+          dashboardDB_HOST: ${{ secrets.DASHBOARD_DB_HOST }}
+          dashboardDB_USER: ${{ secrets.DASHBOARD_DB_USER }}
+          dashboardDB_PASSWORD: ${{ secrets.DASHBOARD_DB_PASSWORD }}
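Note on the fixed "sleep 30" above: a constant delay can race a slow container
start. A readiness poll is more robust; a minimal sketch in Python, using the
same clickhouse-driver package the scripts below depend on (the helper name
wait_for_clickhouse is this sketch's own, not part of the patch):

    import time
    from clickhouse_driver import Client

    def wait_for_clickhouse(host="localhost", port=9000, timeout=60):
        # Poll SELECT 1 until the server answers or the timeout expires.
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                if Client(host=host, port=port).execute("SELECT 1") == [(1,)]:
                    return True
            except Exception:
                time.sleep(2)
        return False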
diff --git a/dashboard/developer/script/get_data.py b/dashboard/developer/script/get_data.py
new file mode 100644
index 0000000..bbdf0f1
--- /dev/null
+++ b/dashboard/developer/script/get_data.py
@@ -0,0 +1,233 @@
+import pandas as pd
+from clickhouse_driver import Client
+from datetime import datetime, timedelta
+import requests
+import time
+import os
+
+# Read connection settings from the environment; the names must match the
+# env block in updateDeveloperData.yml
+xlabDB_host = os.getenv('xlabDB_HOST')
+xlabDB_user = os.getenv('xlabDB_USER')
+xlabDB_password = os.getenv('xlabDB_PASSWORD')
+
+dashboard_host = os.getenv('dashboardDB_HOST')
+dashboard_user = os.getenv('dashboardDB_USER')
+dashboard_password = os.getenv('dashboardDB_PASSWORD')
+
+# source ClickHouse database (X-lab open source data)
+source_client = Client(
+    host=xlabDB_host,
+    port=9000,  # ClickHouse native protocol port
+    user=xlabDB_user,
+    password=xlabDB_password,
+    database='opensource'
+)
+
+# target ClickHouse database for the dashboard
+target_client = Client(
+    host=dashboard_host,
+    port=9000,
+    user=dashboard_user,
+    password=dashboard_password,
+    database='opensource',
+)
+
+# Return the previous month as YYYYMM
+def get_last_month():
+    today = datetime.today()
+    # the first day of the current month minus one day always lands in last month
+    first_day_of_current_month = today.replace(day=1)
+    last_month = first_day_of_current_month - timedelta(days=1)
+    return last_month.strftime("%Y%m")
+
+
+def query_clickhouse(user_list, query_template, columns, client=source_client, batch_size=150):
+    result = []
+
+    # process the user list in batches
+    for i in range(0, len(user_list), batch_size):
+        batch = user_list[i:i + batch_size]
+        user_batch_list = "', '".join(batch)
+
+        # build and run the query for this batch
+        query = query_template.format(user_batch_list=user_batch_list)
+        result_batch = client.execute(query)
+        result.extend(result_batch)
+
+        print(f'Queried {min(i + batch_size, len(user_list))}/{len(user_list)} users')
+
+    return pd.DataFrame(result, columns=columns)
+
+def save_to_clickhouse(table_name, data, client=target_client):
+    columns = data.columns.tolist()
+
+    # make sure date fields are strings
+    if 'created_at' in columns:
+        data['created_at'] = data['created_at'].astype(str)
+
+    # empty the table before reloading
+    client.execute(f"TRUNCATE TABLE {table_name}")
+    print(f"Table {table_name} has been truncated")
+
+    # explicitly convert NaN to None (NULL)
+    data = data.applymap(lambda x: None if pd.isna(x) else x)
+
+    # insert as a list of record dicts
+    records = data.to_dict('records')
+
+    try:
+        client.execute(f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES", records)
+        print(f"Data saved to table {table_name}")
+    except Exception as e:
+        print(f"Insert failed: {e}")
+        print(f"Failing SQL: INSERT INTO {table_name} ({', '.join(columns)}) VALUES")
+
+
+# Fetch data from the OpenLeaderboard OSS API
+def fetch_openleaderboard_data(region, metric, month=None):
+    # default to last month when no month is given
+    if month is None:
+        month = get_last_month()
+
+    url = f"https://oss.x-lab.info/open_leaderboard/{metric}/actor/{region}/{month}.json"
+    print("Requesting URL:", url)
+    response = requests.get(url)
+
+    if response.status_code == 200:
+        data = response.json().get("data", [])[:300]  # keep the top 300 entries
+        # extract actor_login and value
+        return [
+            {
+                "actor_login": item["item"]["name"],
+                "value": item["value"]
+            } for item in data
+        ]
+    else:
+        print("Request failed, status code:", response.status_code, response.text)
+        return []
+
+def get_openleaderboard_user_list():
+    metrics = ["activity", "open_rank"]
+    regions = ["chinese", "global"]
+
+    combined_df = pd.DataFrame()
+
+    # fetch every metric for every region
+    for metric in metrics:
+        for region in regions:
+            data = fetch_openleaderboard_data(region, metric)
+
+            if data:  # skip empty results
+                df = pd.DataFrame(data)
+
+                # name the value column after the metric and region
+                value_column_name = f"{metric}_{region}_value"
+                df[value_column_name] = df["value"].astype(str)
+                df = df.drop(columns=["value"])
+
+                # merge on actor_login
+                if combined_df.empty:
+                    combined_df = df
+                else:
+                    combined_df = pd.merge(combined_df, df[['actor_login', value_column_name]], on='actor_login', how='outer')
+
+    combined_df = combined_df.fillna('')
+
+    return combined_df
+
+# Fetch one user's OpenDigger metric data
+def get_opendigger_data(actor_login, metric, retries=3, delay=5):
+    url = f'https://oss.x-lab.info/open_digger/github/{actor_login}/{metric}.json'
+
+    # retry only on request exceptions
+    for attempt in range(retries):
+        try:
+            response = requests.get(url, timeout=30)
+
+            # a non-200 response is not retried
+            if response.status_code != 200:
+                return []
+
+            data = response.json()
+            result = []
+            for key, value in data.items():
+                if len(key) == 7:  # keep only month keys such as 2020-01
+                    result.append({'actor_login': actor_login, 'month': key, metric: value})
+            return result
+
+        except requests.exceptions.RequestException as e:
+            # network errors such as connection failures and timeouts
+            print(f"RequestException: {e}, retrying... ({attempt + 1}/{retries})")
+            time.sleep(delay)
+
+        except ValueError as e:
+            # JSON decoding errors
+            print(f"JSON Decode Error: {e} for {url}")
+            return []
+
+    # all retries failed
+    print(f"Failed to fetch data after {retries} attempts for {url}")
+    return []
+
+
+# Fetch OpenDigger metric data for every user
+def get_all_user_opendigger_data(user_list):
+    opendigger_failed_user = []
+    activity_data = []
+    openrank_data = []
+
+    for actor_login in user_list:
+        # the user's activity data
+        activities = get_opendigger_data(actor_login, 'activity')
+        activity_data.extend(activities)
+
+        # the user's OpenRank data
+        openranks = get_opendigger_data(actor_login, 'openrank')
+        openrank_data.extend(openranks)
+
+        if activities == [] and openranks == []:
+            opendigger_failed_user.append(actor_login)
+
+    activity_df = pd.DataFrame(activity_data).astype(str)
+    openrank_df = pd.DataFrame(openrank_data).astype(str)
+
+    opendigger_failed_user = pd.DataFrame(opendigger_failed_user, columns=['actor_login'])
+    opendigger_failed_user.to_csv('dashboard/developer/data/failed_user_opendigger.csv', index=False)
+
+    # outer join on actor_login and month
+    combined_df = pd.merge(activity_df, openrank_df, on=['actor_login', 'month'], how='outer')
+    combined_df = combined_df.fillna('')
+
+    return combined_df
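Because save_to_clickhouse runs TRUNCATE before INSERT, dashboard queries that
arrive in between see an empty table. One alternative is to load into a staging
table and swap it in atomically; a sketch, assuming an Atomic-engine database
and a pre-created twin table named <table>_staging with an identical schema
(both the function name and the staging convention are this sketch's
assumptions, not part of the patch):

    def save_to_clickhouse_atomic(table_name, data, client=target_client):
        staging = f"{table_name}_staging"  # hypothetical pre-created twin table
        cols = ', '.join(data.columns)
        client.execute(f"TRUNCATE TABLE {staging}")
        client.execute(f"INSERT INTO {staging} ({cols}) VALUES", data.to_dict('records'))
        # EXCHANGE TABLES swaps the two tables in a single atomic step
        client.execute(f"EXCHANGE TABLES {staging} AND {table_name}")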
diff --git a/dashboard/developer/script/get_data_github_api.py b/dashboard/developer/script/get_data_github_api.py
new file mode 100644
index 0000000..c77435d
--- /dev/null
+++ b/dashboard/developer/script/get_data_github_api.py
@@ -0,0 +1,145 @@
+import pandas as pd
+import requests
+import os
+
+# GitHub API token from the environment
+TOKEN = os.getenv('GITHUB_TOKEN')
+
+# GitHub GraphQL endpoint and auth headers
+URL = "https://api.github.com/graphql"
+HEADERS = {
+    "Authorization": f"Bearer {TOKEN}",
+    "Content-Type": "application/json",
+}
+
+# output file paths
+OUTPUT_FILE = "dashboard/developer/data/user_api_info.csv"
+FAILED_USER_FILE = 'dashboard/developer/data/failed_user_github_api.csv'
+
+
+def get_existing_usernames():
+    """Read usernames that were already fetched, creating the file if needed."""
+    if os.path.exists(OUTPUT_FILE):
+        existing_data = pd.read_csv(OUTPUT_FILE)
+        return existing_data['actor_login'].tolist()
+    else:
+        # create an empty file containing only the header row
+        columns = ["actor_login", "actor_id", "followers", "following", "totalRepositories", "totalRepositoriesContributedTo",
+                   "totalStarredRepositories", "totalGists", "total_repo_stars", "total_forks", "totalCommitContributions",
+                   "totalPullRequestContributions", "totalIssueContributions", "totalRepositoryContributions",
+                   "totalPullRequestReviewContributions", "totalRepositoriesWithContributedIssues",
+                   "totalRepositoriesWithContributedPullRequests", "totalRepositoriesWithContributedCommits",
+                   "repositoryDiscussions", "pullRequests", "issues", "organizations"]
+        pd.DataFrame(columns=columns).to_csv(OUTPUT_FILE, index=False)
+        return []
+
+
+def get_user_data(username):
+    """Fetch one user's profile and contribution statistics via GraphQL."""
+    query = f"""
+    {{
+      user(login: "{username}") {{
+        login
+        databaseId
+        followers {{ totalCount }}
+        following {{ totalCount }}
+        repositories(first: 100) {{
+          totalCount
+          nodes {{
+            name
+            stargazerCount
+            forkCount
+          }}
+        }}
+        repositoriesContributedTo(first: 100) {{
+          totalCount
+          nodes {{
+            name
+            owner {{ login }}
+          }}
+        }}
+        starredRepositories(first: 100) {{ totalCount }}
+        gists {{ totalCount }}
+        contributionsCollection {{
+          totalCommitContributions
+          totalPullRequestContributions
+          totalIssueContributions
+          totalRepositoryContributions
+          totalPullRequestReviewContributions
+          totalRepositoriesWithContributedIssues
+          totalRepositoriesWithContributedPullRequests
+          totalRepositoriesWithContributedCommits
+        }}
+        repositoryDiscussions(first: 100) {{ totalCount }}
+        pullRequests(first: 100) {{ totalCount }}
+        issues(first: 100) {{ totalCount }}
+        organizations(first: 100) {{ totalCount }}
+      }}
+    }}
+    """
+    try:
+        response = requests.post(URL, json={"query": query}, headers=HEADERS, timeout=30)
+        response.raise_for_status()  # raises for non-200 responses
+        return response.json().get("data", {}).get("user", None)
+    except requests.exceptions.RequestException as e:
+        print(f"Failed to fetch data for {username}: {e}")
+        return None
+
+
+def save_user_info(user_info):
+    """Append one user's record to the output CSV."""
+    if user_info:
+        with open(OUTPUT_FILE, mode="a", newline='', encoding='utf-8') as f:
+            pd.DataFrame([user_info]).to_csv(f, index=False, header=False)
+
+
+def get_data_from_graph_ql(usernames):
+    failed_user = []
+    existing_usernames = get_existing_usernames()
+
+    for username in usernames:
+        if username in existing_usernames:
+            # already fetched, skip
+            continue
+
+        data = get_user_data(username)
+        if data:
+            repos = data.get("repositories", {}).get("nodes", [])
+            user_info = {
+                "actor_login": data.get("login", ""),
+                "actor_id": data.get("databaseId", ""),
+                "followers": data.get("followers", {}).get("totalCount", 0),
+                "following": data.get("following", {}).get("totalCount", 0),
+                "totalRepositories": data.get("repositories", {}).get("totalCount", 0),
+                "totalRepositoriesContributedTo": data.get("repositoriesContributedTo", {}).get("totalCount", 0),
+                "totalStarredRepositories": data.get("starredRepositories", {}).get("totalCount", 0),
+                "totalGists": data.get("gists", {}).get("totalCount", 0),
+                # stars and forks are summed over at most the first 100 repositories
+                "total_repo_stars": sum(repo.get("stargazerCount", 0) for repo in repos),
+                "total_forks": sum(repo.get("forkCount", 0) for repo in repos),
+                "totalCommitContributions": data.get("contributionsCollection", {}).get("totalCommitContributions", 0),
+                "totalPullRequestContributions": data.get("contributionsCollection", {}).get("totalPullRequestContributions", 0),
+                "totalIssueContributions": data.get("contributionsCollection", {}).get("totalIssueContributions", 0),
+                "totalRepositoryContributions": data.get("contributionsCollection", {}).get("totalRepositoryContributions", 0),
+                "totalPullRequestReviewContributions": data.get("contributionsCollection", {}).get("totalPullRequestReviewContributions", 0),
+                "totalRepositoriesWithContributedIssues": data.get("contributionsCollection", {}).get("totalRepositoriesWithContributedIssues", 0),
+                "totalRepositoriesWithContributedPullRequests": data.get("contributionsCollection", {}).get("totalRepositoriesWithContributedPullRequests", 0),
+                "totalRepositoriesWithContributedCommits": data.get("contributionsCollection", {}).get("totalRepositoriesWithContributedCommits", 0),
+                "repositoryDiscussions": data.get("repositoryDiscussions", {}).get("totalCount", 0),
+                "pullRequests": data.get("pullRequests", {}).get("totalCount", 0),
+                "issues": data.get("issues", {}).get("totalCount", 0),
+                "organizations": data.get("organizations", {}).get("totalCount", 0),
+            }
+
+            save_user_info(user_info)
+        else:
+            failed_user.append(username)
+            print(f"Failed to fetch data for {username}")
+
+    if failed_user:
+        failed_df = pd.DataFrame(failed_user, columns=['actor_login'])
+        failed_df.to_csv(FAILED_USER_FILE, index=False)
+
+    print("User data saved to user_api_info.csv")
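A caveat on the star/fork sums above: repositories(first: 100) returns at most
100 nodes, so total_repo_stars and total_forks undercount users with more
repositories, even though totalCount itself is exact. Cursor pagination covers
the rest; a sketch against the public GraphQL schema, reusing this file's URL
and HEADERS (the function name is this sketch's own, and it assumes the login
exists):

    def sum_all_repo_stats(username):
        # Walk the repositories connection page by page via pageInfo cursors.
        stars = forks = 0
        cursor = None
        while True:
            after = f', after: "{cursor}"' if cursor else ""
            query = f"""
            {{ user(login: "{username}") {{
                 repositories(first: 100{after}) {{
                   pageInfo {{ hasNextPage endCursor }}
                   nodes {{ stargazerCount forkCount }}
                 }}
               }} }}"""
            resp = requests.post(URL, json={"query": query}, headers=HEADERS, timeout=30)
            resp.raise_for_status()
            repos = resp.json()["data"]["user"]["repositories"]
            stars += sum(n["stargazerCount"] for n in repos["nodes"])
            forks += sum(n["forkCount"] for n in repos["nodes"])
            if not repos["pageInfo"]["hasNextPage"]:
                return stars, forks
            cursor = repos["pageInfo"]["endCursor"]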
diff --git a/dashboard/developer/script/update_data.py b/dashboard/developer/script/update_data.py
new file mode 100644
index 0000000..56661a3
--- /dev/null
+++ b/dashboard/developer/script/update_data.py
@@ -0,0 +1,154 @@
+import get_data
+import get_data_github_api
+import pandas as pd
+
+# Fetch data from ClickHouse, save it to CSV, then load it into the dashboard DB
+def fetch_and_save_data(query, user_logins, data_name, csv_path, columns):
+    data = get_data.query_clickhouse(user_logins, query, columns)
+    data.to_csv(csv_path, index=False, encoding='utf-8')
+    get_data.save_to_clickhouse(data_name, data)
+
+# Step 1: get last month's top-300 user list from the OpenLeaderboard API
+user_list = get_data.get_openleaderboard_user_list()
+user_list.to_csv('dashboard/developer/data/user_list.csv', index=False, encoding='utf-8')
+get_data.save_to_clickhouse('user_list', user_list)
+user_logins = user_list['actor_login'].tolist()
+
+# Step 2: get per-month activity and OpenRank from the OpenDigger API
+user_openrank_activity_by_month = get_data.get_all_user_opendigger_data(user_logins)
+user_openrank_activity_by_month.to_csv('dashboard/developer/data/user_openrank_activity_by_month.csv', index=False, encoding='utf-8')
+get_data.save_to_clickhouse('user_openrank_activity_by_month', user_openrank_activity_by_month)
+
+# Step 3: get profile data from the GitHub GraphQL API
+get_data_github_api.get_data_from_graph_ql(user_logins)
+user_api_info = pd.read_csv("dashboard/developer/data/user_api_info.csv")
+get_data.save_to_clickhouse('user_api_info', user_api_info)
+
+# Step 4: query ClickHouse
+
+# 1. Latest events per developer: user_latest_events_list
+query_events_list = """
+SELECT actor_login, repo_name, CAST(`type` AS String) AS type, action, created_at
+FROM (
+    SELECT actor_login,
+           repo_name,
+           `type`,
+           action,
+           created_at,
+           ROW_NUMBER() OVER (PARTITION BY actor_id ORDER BY created_at DESC) AS row_num
+    FROM events
+    WHERE platform = 1
+      AND created_at > now() - INTERVAL 6 MONTH
+      AND actor_login IN ('{user_batch_list}')
+    ) AS ActorRecentEvents
+WHERE row_num <= 30
+ORDER BY actor_login, row_num;
+"""
+fetch_and_save_data(query_events_list, user_logins, 'user_latest_events_list',
+                    'dashboard/developer/data/user_latest_events_list.csv',
+                    ['actor_login', 'repo_name', 'type', 'action', 'created_at'])
+
+# 2. Most active repositories per developer: user_top10_repo_list
+query_repo_list = """
+SELECT actor_login, repo_name, event_count
+FROM (
+    SELECT actor_login,
+           repo_name,
+           COUNT(*) AS event_count,
+           ROW_NUMBER() OVER (PARTITION BY actor_login ORDER BY COUNT(*) DESC) AS row_num
+    FROM events
+    WHERE created_at > NOW() - INTERVAL 6 MONTH
+      AND platform = 1
+      AND actor_login IN ('{user_batch_list}')
+    GROUP BY actor_login, repo_name
+    ) AS ranked
+WHERE row_num <= 10
+ORDER BY actor_login, row_num;
+"""
+fetch_and_save_data(query_repo_list, user_logins, 'user_top10_repo_list',
+                    'dashboard/developer/data/user_top10_repo_list.csv',
+                    ['actor_login', 'repo_name', 'event_count'])
+
+# 3. Issue activity per developer: user_issue_info_by_month
+query_issue = '''
+SELECT actor_login,
+       date_trunc('month', created_at) AS month,
+       COUNT(CASE
+                 WHEN action IN ('opened', 'reopened') THEN 1
+                 ELSE NULL
+             END) AS opened_count,
+       COUNT(CASE
+                 WHEN action = 'closed' THEN 1
+                 ELSE NULL
+             END) AS closed_count,
+       COUNT(CASE
+                 WHEN action IN ('opened', 'reopened') THEN 1
+                 ELSE NULL
+             END) - COUNT(CASE
+                              WHEN action = 'closed' THEN 1
+                              ELSE NULL
+                          END) AS waited_count
+FROM opensource.events
+WHERE type = 7
+  AND platform = 1
+  AND actor_login IN ('{user_batch_list}')
+GROUP BY actor_login, month
+ORDER BY actor_login, month;
+'''
+fetch_and_save_data(query_issue, user_logins, 'user_issue_info_by_month',
+                    'dashboard/developer/data/user_issue_info_by_month.csv',
+                    ['actor_login', 'month', 'opened_count', 'closed_count', 'waited_count'])
+
+# 4. PR activity per developer: user_pr_info_by_month
+query_pr = '''
+SELECT actor_login,
+       date_trunc('month', created_at) AS month,
+       COUNT(CASE
+                 WHEN action IN ('opened', 'reopened') THEN 1
+                 ELSE NULL
+             END) AS opened_count,
+       COUNT(CASE
+                 WHEN pull_merged_at IS NOT NULL THEN 1
+                 ELSE NULL
+             END) AS merged_count,
+       COUNT(CASE
+                 WHEN pull_merged_at IS NULL AND action = 'closed' THEN 1
+                 ELSE NULL
+             END) AS closed_count
+FROM opensource.events
+WHERE type = 10
+  AND platform = 1
+  AND actor_login IN ('{user_batch_list}')
+GROUP BY actor_login, month
+ORDER BY actor_login, month;
+'''
+fetch_and_save_data(query_pr, user_logins, 'user_pr_info_by_month',
+                    'dashboard/developer/data/user_pr_info_by_month.csv',
+                    ['actor_login', 'month', 'opened_count', 'merged_count', 'closed_count'])
+
+# 5. Fork counts per developer: user_fork_count
+query_fork = '''
+SELECT actor_login, COUNT(*) AS fork_count
+FROM events
+WHERE type = 4
+  AND platform = 1
+  AND actor_login IN ('{user_batch_list}')
+GROUP BY actor_login
+'''
+fetch_and_save_data(query_fork, user_logins, 'user_fork_count',
+                    'dashboard/developer/data/user_fork_count.csv',
+                    ['actor_login', 'fork_count'])
+
+# 6. Merged PR counts per developer: user_pr_merge_count
+query_merge_pr = '''
+SELECT actor_login, COUNT(*) AS pr_merged_count
+FROM opensource.events
+WHERE type = 10
+  AND pull_merged_at IS NOT NULL
+  AND platform = 1
+  AND actor_login IN ('{user_batch_list}')
+GROUP BY actor_login
+'''
+fetch_and_save_data(query_merge_pr, user_logins, 'user_pr_merge_count',
+                    'dashboard/developer/data/user_pr_merge_count.csv',
+                    ['actor_login', 'pr_merged_count'])
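One more note on the pipeline above: save_to_clickhouse truncates the target
table before inserting, so an upstream query that fails or returns nothing
would wipe a dashboard table for the whole month. A defensive wrapper that
skips the refresh instead (a sketch reusing the helpers defined in this patch;
the _safe suffix is this sketch's own):

    def fetch_and_save_data_safe(query, user_logins, data_name, csv_path, columns):
        data = get_data.query_clickhouse(user_logins, query, columns)
        if data.empty:
            # keep the previous load rather than truncating to nothing
            print(f"{data_name}: query returned no rows, keeping existing table")
            return
        data.to_csv(csv_path, index=False, encoding='utf-8')
        get_data.save_to_clickhouse(data_name, data)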
From bb3c9ba02b1265961daf943196c535210fe51c9f Mon Sep 17 00:00:00 2001
From: wj23027 <809711241@qq.com>
Date: Wed, 4 Dec 2024 20:08:39 +0800
Subject: [PATCH 2/2] ci: modify updateDashboardRepositoryData.yml

---
 .github/workflows/updateDashboardRepositoryData.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/updateDashboardRepositoryData.yml b/.github/workflows/updateDashboardRepositoryData.yml
index 5448339..5639b75 100644
--- a/.github/workflows/updateDashboardRepositoryData.yml
+++ b/.github/workflows/updateDashboardRepositoryData.yml
@@ -35,3 +35,4 @@ jobs:
       dashboardDB_HOST: ${{ secrets.DASHBOARD_DB_HOST }}
       dashboardDB_USER: ${{ secrets.DASHBOARD_DB_USER }}
       dashboardDB_PASSWORD: ${{ secrets.DASHBOARD_DB_PASSWORD }}
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
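Environment variable names are case-sensitive on the Linux runner, so the name
added here must match the os.getenv('GITHUB_TOKEN') lookup in
get_data_github_api.py exactly. A fail-fast guard at the top of that script
makes a missing or misnamed token obvious (a sketch; the exit message is
illustrative):

    import os
    import sys

    if not os.getenv("GITHUB_TOKEN"):
        sys.exit("GITHUB_TOKEN is not set; all GraphQL requests would fail with 401")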