Skip to content

Commit

Permalink
added mongodb service
Browse files Browse the repository at this point in the history
  • Loading branch information
costero-e committed Feb 13, 2024
1 parent 29e7a37 commit 2d6ffb6
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 41 deletions.
11 changes: 8 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,21 @@ The **output_docs_folder** sets the folder where your final .json files will be
#### VCF conversion config parameters
The **num_variants** variable must be set if you are running the VCF converter (genomicVariations_vcf.py). It tells the script how many VCF lines will be read and converted from the file(s).
The **reference_genome** is the reference genome the tool uses to map the positions of the chromosomes.
The **allele_frequency** lets you set a threshold for the allele frequency of the variants you want to convert from the vcf file.

### Converting data from .vcf (.vcf.gz) file
### Converting data from .vcf.gz file

To convert data from .vcf (.vcf.gz) to .json, you will have to copy all the files you want to convert inside the [files_to_read folder](https://github.com/EGA-archive/beacon2-ri-tools-v2/tree/main/files/vcf/files_to_read).
To convert data from .vcf.gz to .json, you will need to copy all the files you want to convert inside the [files_to_read folder](https://github.com/EGA-archive/beacon2-ri-tools-v2/tree/main/files/vcf/files_to_read).
You will need to provide one .vcf.gz file and save it in this folder.

```bash
docker exec -it ri-tools python genomicVariations_vcf.py
```
This will generate the final .json file, in Beacon Friendly Format, in the output_docs folder, named after the collection followed by the .json extension, e.g. genomicVariations.json.
After that, if needed, export your documents from mongoDB to a .json file using this command:
```bash
docker exec ri-tools-mongo mongoexport --jsonArray --uri "mongodb://root:example@127.0.0.1:27017/beacon?authSource=admin" --collection genomicVariations | jq 'del(.[]._id)' > genomicVariations.json
```
This will generate the final .json file in Beacon Friendly Format. Bear in mind that this time the file will be saved in the directory you are currently in, so if you want to save it in the output_docs folder, include that folder in the output file path of the mongoexport command.

### Creating the .csv file (for metadata, or when you do not have a vcf file for genomicVariations)

Expand Down
14 changes: 10 additions & 4 deletions conf/conf.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
#### Input and Output files config parameters ####
csv_filename='./csv/examples/genomicVariations.csv'
csv_filename='./csv/output3.csv'
output_docs_folder='./output_docs/'

#### VCF Conversion config parameters ####
num_variants=1000000
num_variants=10000000
allele_frequency=1 # introduce float number, leave 1 if you want to convert all the variants
reference_genome='GRCh38' # Choose one between NCBI36, GRCh37, GRCh38



### MongoDB parameters ###
database_host = 'mongo'
database_port = 27017
database_user = 'root'
database_password = 'example'
database_name = 'beacon'
database_auth_source = 'admin'
23 changes: 22 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
version: "3.8"

networks:
my-app-network:
external: true

services:

beacon-ri-tools-v2:
Expand All @@ -11,4 +15,21 @@ services:
- ./conf:/usr/src/app/conf
- ./scripts/datasheet/conf:/usr/src/app/scripts/datasheet/conf
- ./files/vcf/files_to_read:/usr/src/app/files/vcf/files_to_read
- ./csv:/usr/src/app/csv
- ./csv:/usr/src/app/csv
networks:
- my-app-network

db:
container_name: ri-tools-mongo
image: mongo:5
hostname: mongo
ports:
- 27017:27017
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: example
MONGO_INITDB_DATABASE: beacon
volumes:
- ./mongo-init/:/docker-entrypoint-initdb.d/:ro
networks:
- my-app-network
66 changes: 33 additions & 33 deletions genomicVariations_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,38 @@
import uuid
import json
import gc
from pymongo.mongo_client import MongoClient

client = MongoClient(
#"mongodb://127.0.0.1:27017/"
"mongodb://{}:{}@{}:{}/{}?authSource={}".format(
conf.database_user,
conf.database_password,
conf.database_host,
conf.database_port,
conf.database_name,
conf.database_auth_source,
)
)

with open('files/deref_schemas/genomicVariations.json') as json_file:
dict_properties = json.load(json_file)

def generate(dict_properties):
byt_combined=b'['
total_dict =[]
i=1
l=0

num_rows=conf.num_variants

for vcf_filename in glob.glob("files/vcf/files_to_read/*.vcf.gz"):
print(vcf_filename)
vcf = VCF(vcf_filename, strict_gt=True)
my_target_list = vcf.samples

count=0



num_rows=conf.num_variants
pbar = tqdm(total = num_rows)
for v in vcf:
try:
Expand All @@ -33,7 +48,7 @@ def generate(dict_properties):
try:
allele_frequency = v.INFO.get('AF')
if isinstance(allele_frequency, float):
if allele_frequency > 0.1: continue
if allele_frequency > conf.allele_frequency: continue
except Exception:
pass

Expand All @@ -49,8 +64,13 @@ def generate(dict_properties):
dict_to_xls['variation|referenceBases'] = ref
try:
dict_to_xls['variation|variantType'] = v.INFO.get('VT')
if v.INFO.get('VT') is None:
if len(alt[0]) == len(ref):
dict_to_xls['variation|variantType']='SNP'
else:
dict_to_xls['variation|variantType']='INDEL'
except Exception:
pass
dict_to_xls['variation|variantType']='UNKNOWN'
#print(v.INFO.get('ANN'))
if v.INFO.get('ANN') is not None:
annot = v.INFO.get('ANN')
Expand Down Expand Up @@ -400,19 +420,12 @@ def generate(dict_properties):
total_dict.append(definitivedict)

if i == num_rows:
s = json.dumps(total_dict)
s = s[0].replace('[','') + s[1:-1] + s[-1:].replace(']','')
s = s.encode('utf-8')
byt_combined+=s+b']'

client.beacon.genomicVariations.insert_many(total_dict)
pbar.update(1)
break
elif (i/25000).is_integer():
s = json.dumps(total_dict)
#print(s)
s = s[0].replace('[','') + s[1:-1] + s[-1:].replace(']',',')
s = s.encode('utf-8')
byt_combined+=s
del s
elif (i/10000).is_integer():
client.beacon.genomicVariations.insert_many(total_dict)
del definitivedict
del total_dict
gc.collect()
Expand All @@ -424,30 +437,17 @@ def generate(dict_properties):
i+=1

if i != num_rows:
s = json.dumps(total_dict)
s = s[0].replace('[','') + s[1:-1] + s[-1:].replace(']','')
s = s.encode('utf-8')

byt_combined+=s+b']'
client.beacon.genomicVariations.insert_many(total_dict)


#print(byt_combined)

total_dict=json.loads(byt_combined.decode('utf-8'))

pbar.close()
return total_dict, i, l

dict_generado, total_i, l=generate(dict_properties)

output = conf.output_docs_folder + 'genomicVariations.json'

return i, l

total_i, l=generate(dict_properties)

with open(output, 'w') as f:
json.dump(dict_generado, f)

if total_i-l > 0:
print('Successfully converted {} registries into {}'.format(total_i-l, output))
print('Successfully inserted {} records into beacon'.format(total_i-l))
else:
print('No registries found.')
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ python-dateutil==2.8.2
tqdm==4.66.1
urllib3==2.0.7
cyvcf2==0.30.28
pymongo==4.6.1

0 comments on commit 2d6ffb6

Please sign in to comment.