Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modify script to include active portfolio sample #14

Merged
merged 2 commits into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data/grants_comparison/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
/comparison.csv
/meshterms_list.txt
/comparison.csv
4 changes: 0 additions & 4 deletions data/grants_comparison/comparison.csv.dvc

This file was deleted.

1 change: 1 addition & 0 deletions data/raw/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
/allMeSH_2021.jsonl
/desc2021.xml
/disease_tags_validation_grants.xlsx
/active_grants_last_5_years.csv
4 changes: 4 additions & 0 deletions data/raw/active_grants_last_5_years.csv.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
outs:
- md5: d664be2a9000d44bb0325f364ec20e27
size: 4953477
path: active_grants_last_5_years.csv
19 changes: 14 additions & 5 deletions pipelines/generate_grants/dvc.lock
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
schema: '2.0'
stages:
generate:
cmd: python ../../scripts/create_grants_sample.py --s3-url s3://datalabs-data/dimensions/grants/grants
--num-parquet-files-to-consider 10 --num-samples-per-cat 10 --pre-annotate True
cmd: python scripts/create_xlinear_bertmesh_comparison_csv.py --s3-url s3://datalabs-data/dimensions/grants/grants
--num-parquet-files-to-consider 10 --num-samples-per-cat 10 --mesh-metadata-path
data/raw/desc2021.xml --mesh-terms-list-path data/grants_comparison/meshterms_list.txt
--active-portfolio-path data/raw/active_grants_last_5_years.csv --bertmesh-path
Wellcome/WellcomeBertMesh --bertmesh-thresh 0.5 --pre-annotate-bertmesh --xlinear-path
models/xlinear-0.2.5/model --xlinear-label-binarizer-path models/xlinear-0.2.5/label_binarizer.pkl
--xlinear-thresh 0.2 --pre-annotate-xlinear --output-path data/grants_comparison/comparison.csv
deps:
- path: scripts/create_xlinear_bertmesh_comparison_csv.py
md5: 0a91bf23be4068bdc7c4b7a32d80ff2d
size: 8214
outs:
- path: grants_sample.jsonl
md5: 76bbfd9043e20866382ff9713cba7483
size: 387951
- path: data/grants_comparison/comparison.csv
md5: bc4fd9f4a670409dad07ffd03cf421f1
size: 596654
24 changes: 20 additions & 4 deletions pipelines/generate_grants/dvc.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,25 @@
vars:
- s3-url: "s3://datalabs-data/dimensions/grants/grants"
- scripts_location: "../../scripts"
- argilla_project_name: "grants"
stages:
generate:
cmd: python ${scripts_location}/create_grants_sample.py --s3-url ${s3-url} --num-parquet-files-to-consider 10 --num-samples-per-cat 10 --pre-annotate True
cmd: >-
python scripts/create_xlinear_bertmesh_comparison_csv.py
--s3-url ${s3-url}
--num-parquet-files-to-consider 10
--num-samples-per-cat 10
--mesh-metadata-path data/raw/desc2021.xml
--mesh-terms-list-path data/grants_comparison/meshterms_list.txt
--active-portfolio-path data/raw/active_grants_last_5_years.csv
--bertmesh-path Wellcome/WellcomeBertMesh
--bertmesh-thresh 0.5
--pre-annotate-bertmesh
--xlinear-path models/xlinear-0.2.5/model
--xlinear-label-binarizer-path models/xlinear-0.2.5/label_binarizer.pkl
--xlinear-thresh 0.2
--pre-annotate-xlinear
--output-path data/grants_comparison/comparison.csv
deps:
- scripts/create_xlinear_bertmesh_comparison_csv.py
wdir: "../.."
outs:
- grants_sample.jsonl
- data/grants_comparison/comparison.csv
19 changes: 19 additions & 0 deletions scripts/create_xlinear_bertmesh_comparison_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ def create_comparison_csv(
num_samples_per_cat: int,
mesh_metadata_path: str,
mesh_terms_list_path: str,
active_portfolio_path: str,
active_portfolio_sample: int,
pre_annotate_bertmesh: bool,
bertmesh_path: str,
bertmesh_thresh: float,
Expand Down Expand Up @@ -121,11 +123,23 @@ def create_comparison_csv(
grants_sample = all_grants.groupby("for_first_level_name", group_keys=False).apply(
lambda x: x.sample(min(len(x), num_samples_per_cat))
)
grants_sample["active_portfolio"] = 0

# Add active portfolio: append a random sample of active grants, using their
# "Synopsis" text as the abstract and flagging them with active_portfolio = 1.
active_grants = pd.read_csv(active_portfolio_path)
# Rows without a synopsis have no text to tag, so drop them before sampling.
active_grants = active_grants.dropna(subset=["Synopsis"])
# DataFrame.sample returns a NEW shuffled frame (it is not in-place); the
# result must be assigned back, otherwise the slice below always takes the
# first rows of the CSV in file order instead of a random sample.
active_grants = active_grants.sample(frac=1)
# Slicing tolerates active_portfolio_sample > len(active_grants): it simply
# keeps every available row.
active_grants_sample = active_grants.iloc[:active_portfolio_sample]
active_grants_sample = pd.DataFrame({"abstract": active_grants_sample["Synopsis"]})
active_grants_sample["active_portfolio"] = 1
grants_sample = pd.concat([grants_sample, active_grants_sample])

abstracts = grants_sample["abstract"].tolist()
print(f"{len(abstracts)} abstracts to tag")

# Annotate with bertmesh
if pre_annotate_bertmesh:
print("Tagging with bertmesh")
tags = predict_tags_bertmesh(
abstracts,
bertmesh_path,
Expand All @@ -141,6 +155,7 @@ def create_comparison_csv(

# Annotate with xlinear
if pre_annotate_xlinear:
print("Tagging with xlinear")
model = MeshXLinear(
model_path=xlinear_path,
label_binarizer_path=xlinear_label_binarizer_path,
Expand Down Expand Up @@ -187,6 +202,8 @@ def create_comparison_csv(
parser.add_argument("--num-samples-per-cat", type=int, default=10)
parser.add_argument("--mesh-metadata-path", type=str)
parser.add_argument("--mesh-terms-list-path", type=str)
parser.add_argument("--active-portfolio-path", type=str)
parser.add_argument("--active-portfolio-sample", type=int, default=200)
parser.add_argument("--pre-annotate-bertmesh", action="store_true")
parser.add_argument(
"--bertmesh-path", type=str, default="Wellcome/WellcomeBertMesh"
Expand All @@ -206,6 +223,8 @@ def create_comparison_csv(
num_samples_per_cat=args.num_samples_per_cat,
mesh_metadata_path=args.mesh_metadata_path,
mesh_terms_list_path=args.mesh_terms_list_path,
active_portfolio_path=args.active_portfolio_path,
active_portfolio_sample=args.active_portfolio_sample,
pre_annotate_bertmesh=args.pre_annotate_bertmesh,
bertmesh_path=args.bertmesh_path,
bertmesh_thresh=args.bertmesh_thresh,
Expand Down