-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Results.R script for generating data for Labware 8 #51
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#!/usr/bin/env Rscript | ||
# Author: Olinares-perdomo | ||
# Usage: Extracting the SRA, Biosample, GenBank and GISAID accession information in order to \ | ||
# generate the Attributes.txt file, which will be sent to AWS/Lab Ware 8. | ||
|
||
# note: The file from GISAID "gisaid_hcov-19_xxxx.tsv" needs to be download before running this script | ||
|
||
# Parameter: Run Name ; i.e. Rscript Attributes_V1.1.R UT-VH0770-250214 | ||
# last modified on 25/02 | ||
|
||
library(readr);library(lubridate); | ||
suppressPackageStartupMessages(library("dplyr")) | ||
suppressPackageStartupMessages(library("data.table")) | ||
library(utils) | ||
library(httr); | ||
library(rentrez); | ||
library(dplyr); | ||
library(yaml) | ||
|
||
config<-read_yaml("config.yml") | ||
base_path<-config$paths$covidseq_base | ||
|
||
args = commandArgs(trailingOnly=T) | ||
args[1]<-c("UT-VH00770-250214") | ||
full_path_run<- file.path(base_path,args[1],"covidseq_output", "Logs_Intermediates","SampleSheetValidation","SampleSheet_Intermediate.csv") | ||
full_path_pango<- file.path(base_path,args[1],"CecretPangolin","cecret_results.csv") | ||
runPath<-file.path(base_path,args[1]); | ||
|
||
# Extracting Clarity ID form SampleSheet and passed samples from Cecret wf | ||
df1<-read.csv(full_path_run) | ||
x = apply(df1,1, function(x) {any(grepl(x, pattern = "Patient")|grepl(x, pattern = "UT-UPHL")) }) | ||
IDs<-as.data.frame(cbind((cbind(df1[x, ][,c(1)],df1[x, ][,c(3)])))) | ||
Clarity_IDs<-IDs[,c(1,2)]; Clarity_IDs | ||
dataPango<-read.csv(full_path_pango)[,c("sample_id","sample")] | ||
Clarity_IDs_PASSED<-merge(dataPango,Clarity_IDs, by.x ="sample_id", by.y ="V1", all = FALSE)[3] | ||
|
||
# Retrieving SRA and Biosample Accessions from NCBI/SRA DB using rentrez/sra | ||
SRRv<-vector(); SAMNv<-vector();sampleID<-vector() | ||
cat("\rPlease wait, Retrieving SRA and Biosample Accessions from NCBI/SRA DB using rentrez") | ||
for (i in 1:nrow(Clarity_IDs_PASSED)){ | ||
sample<-Clarity_IDs_PASSED[i,1]; | ||
search_results<-entrez_search(db = "sra", term = sample, retmax = 5) | ||
if(length(search_results$ids)>0){ | ||
metadata<-entrez_fetch(db ="sra", id = search_results$ids, rettype = "runinfo", retmodel ="text") | ||
meta<-strsplit(metadata[[1]],"[\n,]")[[1]] | ||
} else { | ||
cat("No SRA Accession, and Biosample Accession were found, NCBI/SRA has not processed the submission yet, Please try again later") | ||
} | ||
SRRv[i]<-meta[48]; SAMNv[i]<-meta[72]; sampleID[i]<-sample | ||
# 48 and 72 are the cell location for SRR and SAMN respectively in the metadata table obtained from rentrez/NCBI/SRA DB | ||
} | ||
SRA_data <- as.data.frame(cbind(as.character(sampleID), as.character(SAMNv),as.character(SRRv))) | ||
colnames(SRA_data)<-c("Clarity ID", "SRA Accession", "Biosample Accession") | ||
cat("\rData from NCBI/SRA was retrieved sucesseful") | ||
|
||
# Retrieving GenBank Accession from GenBank DB using rentrez/nuccore | ||
cat("Wait, Retrieving GenBank Accessions from NCBI/GenBank DB using rentrez") | ||
GenBank_Accession<-vector();SRRv<-vector(); SAMNv<-vector();sampleID<-vector() | ||
for (i in 1:nrow(Clarity_IDs_PASSED)){ | ||
sample<-Clarity_IDs_PASSED[i,1]; | ||
search_results<-entrez_search(db = "nuccore", term = sample, retmax = 5) | ||
if(length(search_results$ids)>0){ | ||
genbank_data <- entrez_fetch(db = "nuccore", id = search_results$ids, rettype = "gb", retmode = "text") | ||
# writeLines(genbank_data, "genbank_data.gb") | ||
meta<-strsplit(genbank_data[[1]],"[\n, ]")[[1]] | ||
} else { | ||
cat("No GenBank Accession, NCBI/GenBank has not processed the submission yet, Please try again later") | ||
} | ||
GenBank_Accession[i]<-meta[8]; sampleID[i]<-sample | ||
# 8 is the cell location for GenBank accession in the metadata table obtained from rentrez/NCBI/GenBank DB | ||
} | ||
GenBank_data <- as.data.frame(cbind(as.character(sampleID), as.character(GenBank_Accession))) | ||
colnames(GenBank_data)<-c("Clarity ID", "GenBank Accession") | ||
SRA_GenBak_data<-merge(SRA_data,GenBank_data, by.x = "Clarity ID", by.y ="Clarity ID", all.x = TRUE) | ||
|
||
# GISAID Accessions: Note: Data from GISAID is typically accessed via their portal an must be downloaded manually. Once downloaded, this script | ||
# will automatically detect the GISAID file and extract the assigned IDs from it. | ||
GISAID_file<-rownames(fileSnapshot(runPath,pattern=glob2rx("gisaid*.tsv"))$info[which.max(fileSnapshot(runPath,pattern=glob2rx("gisaid*.tsv"))$info$mtime),]) | ||
GISAID_path<-paste(runPath,GISAID_file,sep="/") | ||
GISAID_data <- read.table(GISAID_path,header=TRUE,sep="\t", stringsAsFactors=FALSE,comment.char="", fill = T,quote="\"")[,c("Virus.name","Accession.ID")] | ||
GISAID_data$Virus.name<-gsub("^hCoV-19/USA/(.*)/2025","\\1",GISAID_data$Virus.name) | ||
colnames(GISAID_data)<-c("Clarity ID","GenBank Accesion") | ||
Attributes<-merge(SRA_GenBak_data,GISAID_data, by.x ="Clarity ID", by.y = "Clarity ID", all.x = T) | ||
colnames(Attributes)<-c("LIMS_TEST_ID","SRA Accession","Biosample Accession","GenBank Accession","GISAID Accesion") | ||
|
||
# Atributes.txt file generation | ||
# adding to Attributes the samples that failed. All samples passed and failed must be submitted to Lab Ware 8 | ||
Atributes<-merge(Clarity_IDs[2], Attributes, by.x ="V2", by.y = "LIMS_TEST_ID", all.x = TRUE) | ||
colnames( Atributes)<-c("LIMS_TEST_ID", "SRA Accession", "Biosample Accession", "GenBank Accession", "GISAID Accession") | ||
|
||
Atributes$"GISAID Accession"[is.na(Atributes$"GISAID Accession")]= "not Submitted" | ||
Atributes$"SRA Accession"[is.na(Atributes$"SRA Accession")]= "not Submitted" | ||
Atributes$"Biosample Accession"[is.na(Atributes$"Biosample Accession")]= "not Submitted" | ||
Atributes$"GenBank Accession"[is.na(Atributes$"GenBank Accession")]= "not Submitted" | ||
|
||
Attibuttes<-Atributes %>% mutate("Cluster Code" = "") | ||
|
||
write.table(Attibuttes,paste(runPath,"/",args[1],"_6Attributes.txt",sep=""), sep = ',', row.names = FALSE, col.names = TRUE, eol="\r\n", quote = FALSE) | ||
cat("Attributes.txt file for Labware 8 is complete and can be found at",runPath,"\n") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
#!/usr/bin/env Rscript | ||
# Author: OLinares-Perdomo | ||
# Usage: Results.txt file for LW8 (The ParFastas script and Cecret-wf must be executed before) | ||
# Parameter: Run Name ; i.e. Combining_Results.R UT-VH0770-250110 | ||
# last modified on 2025-01-15 | ||
|
||
library("data.table");library("dplyr");library("utils"); | ||
library("lubridate");find("read.table");library("readr"); | ||
Comment on lines
+7
to
+8
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are these installed in prod already? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The read.table and utils packages are part of the base R, while the non-base packages lubridate, ready and dplyr have been successfully installed in the production environment. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. readr, sorry for the typo in the previous response. |
||
|
||
config<-read_yaml("config.yml") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we going to be editing the config file a lot? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't anticipate frequent edits to the config file for this particular script. However, if other team members need to add new lines or modify the existing ones, for their scripts, it can be updated as needed to accommodate those changes |
||
base_path<-config$paths$covidseq_base | ||
base_path | ||
args = commandArgs(trailingOnly=T); | ||
# detecting sequencer type (VH00770, VH01885 or A01290) | ||
if(nchar(args[1])==17){date<-ymd(substr(args[1], 12, 17))} else{date<-ymd(substr(args[1], 11, 16))}; | ||
runPath <-paste(base_path,args[1],sep="/"); | ||
|
||
# reading the SampleSheat | ||
df1 = read.csv(paste(runPath,"covidseq_output/Logs_Intermediates/SampleSheetValidation/SampleSheet_Intermediate.csv" ,sep = "/")); | ||
x = apply(df1,1, function(x) {any(grepl(x, pattern = "Patient")|grepl(x, pattern = "UT-UPHL")) }); | ||
Matrix1<-as.data.frame(cbind((cbind(df1[x, ][,c(1)],df1[x, ][,c(3)])),"run name"=args[1])); | ||
# getting results from Cecret wf | ||
Matrix2<-read.csv(paste(runPath,"CecretPangolin/cecret_results.csv",sep ="/"))[,c("sample","pangolin_lineage","nextclade_clade","num_total")] | ||
# Merging ss and resuts from Cecret | ||
results<-merge(Matrix1,Matrix2, by.x="V1", by.y= "sample", all.x=T); | ||
colnames(results)<-c("LIMS_TEST_ID","Submission ID","Run Name","Lineage","Nextclade clade","Non-Ambiguous Bases"); | ||
results$Lineage[is.na(results$Lineage)]= "Not able to be sequenced" | ||
results$"Nextclade clade"[is.na(results$"Nextclade clade")]= "Clade not detected" | ||
results$"Non-Ambiguous Bases"[is.na(results$"Non-Ambiguous Bases")]= "0" | ||
# ordering | ||
results<-results %>% select(LIMS_TEST_ID, "Run Name","Submission ID","Non-Ambiguous Bases","Nextclade clade","Lineage"); | ||
# saving results | ||
write.table(results,paste("/Volumes/NGS/Analysis/covidseq/",args[1],"/",args[1], | ||
"FinalResults.txt",sep=""), sep = ',', row.names = F, col.names = T, eol="\r\n", quote = F) | ||
|
||
write.table(results,paste("/Volumes/NGS/Analysis/covidseq/",args[1],"/",args[1], | ||
"FinalResults.csv",sep=""), sep = ',', row.names = F, col.names = T, eol="\r\n", quote = F) | ||
|
||
Comment on lines
+32
to
+38
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are there two files? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One is the "Results.txt" for Lab Ware 8, the .csv file is for downstair analysis for the ngs and other scripts that are needed, for now. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
paths: | ||
covidseq_base: "/Volumes/NGS/Analysis/covidseq" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this need to be changed for each run?