From 2d6ffb60697109767da05d5b6bd613caf407b761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oriol=20L=C3=B3pez-Doriga?= Date: Tue, 13 Feb 2024 21:15:32 +0100 Subject: [PATCH] added mongodb service --- README.md | 11 +++++-- conf/conf.py | 14 ++++++--- docker-compose.yml | 23 +++++++++++++- genomicVariations_vcf.py | 66 ++++++++++++++++++++-------------------- requirements.txt | 1 + 5 files changed, 74 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index cbb512c..890e082 100644 --- a/README.md +++ b/README.md @@ -49,16 +49,21 @@ The **output_docs_folder** sets the folder where your final .json files will be #### VCF conversion config parameters The **num_variants** is the variable you need to write in case you are executing the vcf conversor (genomicVariations_vcf.py). This will tell the script how many vcf lines will be read and converted from the file(s). The **reference_genome** is the genome reference your the tool is using to map the position of the chromosomes. +The **allele_frequency** lets you set a threshold for the allele frequency of the variants you want to convert from the vcf file. -### Converting data from .vcf (.vcf.gz) file +### Converting data from .vcf.gz file -To convert data from .vcf (.vcf.gz) to .json, you will have to copy all the files you want to convert inside the [files_to_read folder](https://github.com/EGA-archive/beacon2-ri-tools-v2/tree/main/files/vcf/files_to_read). +To convert data from .vcf.gz to .json, you will need to copy all the files you want to convert inside the [files_to_read folder](https://github.com/EGA-archive/beacon2-ri-tools-v2/tree/main/files/vcf/files_to_read). You will need to provide one .vcf.gz file and save it in this folder. ```bash docker exec -it ri-tools python genomicVariations_vcf.py ``` -This will generate the final .json file that is Beacon Friendly Format in the output_docs folder with the name of the collection followed by .json extension, e.g. genomicVariations.json. 
+After that, if needed, export your documents from MongoDB to a .json file using this command: +```bash +docker exec ri-tools-mongo mongoexport --jsonArray --uri "mongodb://root:example@127.0.0.1:27017/beacon?authSource=admin" --collection genomicVariations | jq 'del(.[]._id)' > genomicVariations.json +``` +This will generate the final .json file in Beacon Friendly Format. Bear in mind that this time the file will be saved in your current working directory, so if you want to save it in the output_docs folder, include that folder in the output path of the command. ### Creating the .csv file (if metadata or not having a vcf file for genomicVariations) diff --git a/conf/conf.py b/conf/conf.py index e1a9f76..e9d94f0 100644 --- a/conf/conf.py +++ b/conf/conf.py @@ -1,10 +1,16 @@ #### Input and Output files config parameters #### -csv_filename='./csv/examples/genomicVariations.csv' +csv_filename='./csv/output3.csv' output_docs_folder='./output_docs/' #### VCF Conversion config parameters #### -num_variants=1000000 +num_variants=10000000 +allele_frequency=1 # introduce float number, leave 1 if you want to convert all the variants reference_genome='GRCh38' # Choose one between NCBI36, GRCh37, GRCh38 - - +### MongoDB parameters ### +database_host = 'mongo' +database_port = 27017 +database_user = 'root' +database_password = 'example' +database_name = 'beacon' +database_auth_source = 'admin' diff --git a/docker-compose.yml b/docker-compose.yml index 1c82dce..9c0bb80 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,9 @@ version: "3.8" +networks: + my-app-network: + external: true + services: beacon-ri-tools-v2: @@ -11,4 +15,21 @@ services: - ./conf:/usr/src/app/conf - ./scripts/datasheet/conf:/usr/src/app/scripts/datasheet/conf - ./files/vcf/files_to_read:/usr/src/app/files/vcf/files_to_read - - ./csv:/usr/src/app/csv \ No newline at end of file + - ./csv:/usr/src/app/csv + networks: + - my-app-network + + db: + container_name: ri-tools-mongo + image: mongo:5 + 
hostname: mongo + ports: + - 27017:27017 + environment: + MONGO_INITDB_ROOT_USERNAME: root + MONGO_INITDB_ROOT_PASSWORD: example + MONGO_INITDB_DATABASE: beacon + volumes: + - ./mongo-init/:/docker-entrypoint-initdb.d/:ro + networks: + - my-app-network \ No newline at end of file diff --git a/genomicVariations_vcf.py b/genomicVariations_vcf.py index ba9795c..6139224 100644 --- a/genomicVariations_vcf.py +++ b/genomicVariations_vcf.py @@ -7,23 +7,38 @@ import uuid import json import gc +from pymongo.mongo_client import MongoClient + +client = MongoClient( + #"mongodb://127.0.0.1:27017/" + "mongodb://{}:{}@{}:{}/{}?authSource={}".format( + conf.database_user, + conf.database_password, + conf.database_host, + conf.database_port, + conf.database_name, + conf.database_auth_source, + ) + ) with open('files/deref_schemas/genomicVariations.json') as json_file: dict_properties = json.load(json_file) def generate(dict_properties): - byt_combined=b'[' total_dict =[] i=1 l=0 - num_rows=conf.num_variants + for vcf_filename in glob.glob("files/vcf/files_to_read/*.vcf.gz"): print(vcf_filename) vcf = VCF(vcf_filename, strict_gt=True) my_target_list = vcf.samples - + count=0 + + + num_rows=conf.num_variants pbar = tqdm(total = num_rows) for v in vcf: try: @@ -33,7 +48,7 @@ def generate(dict_properties): try: allele_frequency = v.INFO.get('AF') if isinstance(allele_frequency, float): - if allele_frequency > 0.1: continue + if allele_frequency > conf.allele_frequency: continue except Exception: pass @@ -49,8 +64,13 @@ def generate(dict_properties): dict_to_xls['variation|referenceBases'] = ref try: dict_to_xls['variation|variantType'] = v.INFO.get('VT') + if v.INFO.get('VT') is None: + if len(alt[0]) == len(ref): + dict_to_xls['variation|variantType']='SNP' + else: + dict_to_xls['variation|variantType']='INDEL' except Exception: - pass + dict_to_xls['variation|variantType']='UNKNOWN' #print(v.INFO.get('ANN')) if v.INFO.get('ANN') is not None: annot = v.INFO.get('ANN') @@ -400,19 
+420,12 @@ def generate(dict_properties): total_dict.append(definitivedict) if i == num_rows: - s = json.dumps(total_dict) - s = s[0].replace('[','') + s[1:-1] + s[-1:].replace(']','') - s = s.encode('utf-8') - byt_combined+=s+b']' + + client.beacon.genomicVariations.insert_many(total_dict) pbar.update(1) break - elif (i/25000).is_integer(): - s = json.dumps(total_dict) - #print(s) - s = s[0].replace('[','') + s[1:-1] + s[-1:].replace(']',',') - s = s.encode('utf-8') - byt_combined+=s - del s + elif (i/10000).is_integer(): + client.beacon.genomicVariations.insert_many(total_dict) del definitivedict del total_dict gc.collect() @@ -424,30 +437,17 @@ def generate(dict_properties): i+=1 if i != num_rows: - s = json.dumps(total_dict) - s = s[0].replace('[','') + s[1:-1] + s[-1:].replace(']','') - s = s.encode('utf-8') - - byt_combined+=s+b']' + client.beacon.genomicVariations.insert_many(total_dict) - #print(byt_combined) - - total_dict=json.loads(byt_combined.decode('utf-8')) pbar.close() - return total_dict, i, l - -dict_generado, total_i, l=generate(dict_properties) - -output = conf.output_docs_folder + 'genomicVariations.json' - + return i, l +total_i, l=generate(dict_properties) -with open(output, 'w') as f: - json.dump(dict_generado, f) if total_i-l > 0: - print('Successfully converted {} registries into {}'.format(total_i-l, output)) + print('Successfully inserted {} records into beacon'.format(total_i-l)) else: print('No registries found.') \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4eb5d7d..5fc1999 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ python-dateutil==2.8.2 tqdm==4.66.1 urllib3==2.0.7 cyvcf2==0.30.28 +pymongo==4.6.1