diff --git a/Attributes_for_labware8.R b/Attributes_for_labware8.R new file mode 100755 index 0000000..9308c5a --- /dev/null +++ b/Attributes_for_labware8.R @@ -0,0 +1,100 @@ +#!/usr/bin/env Rscript +# Author: Olinares-perdomo +# Usage: Extracting the SRA, Biosample, GenBank and GISAID accession information in order to \ +# generate the Attributes.txt file, which will be sent to AWS/Lab Ware 8. + +# note: The file from GISAID "gisaid_hcov-19_xxxx.tsv" needs to be download before running this script + +# Parameter: Run Name ; i.e. Rscript Attributes_V1.1.R UT-VH0770-250214 +# last modified on 25/02 + +library(readr);library(lubridate); +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("data.table")) +library(utils) +library(httr); +library(rentrez); +library(dplyr); +library(yaml) + + config<-read_yaml("config.yml") + base_path<-config$paths$covidseq_base + + args = commandArgs(trailingOnly=T) + args[1]<-c("UT-VH00770-250214") + full_path_run<- file.path(base_path,args[1],"covidseq_output", "Logs_Intermediates","SampleSheetValidation","SampleSheet_Intermediate.csv") + full_path_pango<- file.path(base_path,args[1],"CecretPangolin","cecret_results.csv") + runPath<-file.path(base_path,args[1]); + +# Extracting Clarity ID form SampleSheet and passed samples from Cecret wf + df1<-read.csv(full_path_run) + x = apply(df1,1, function(x) {any(grepl(x, pattern = "Patient")|grepl(x, pattern = "UT-UPHL")) }) + IDs<-as.data.frame(cbind((cbind(df1[x, ][,c(1)],df1[x, ][,c(3)])))) + Clarity_IDs<-IDs[,c(1,2)]; Clarity_IDs + dataPango<-read.csv(full_path_pango)[,c("sample_id","sample")] + Clarity_IDs_PASSED<-merge(dataPango,Clarity_IDs, by.x ="sample_id", by.y ="V1", all = FALSE)[3] + +# Retrieving SRA and Biosample Accessions from NCBI/SRA DB using rentrez/sra + SRRv<-vector(); SAMNv<-vector();sampleID<-vector() + cat("\rPlease wait, Retrieving SRA and Biosample Accessions from NCBI/SRA DB using rentrez") + for (i in 1:nrow(Clarity_IDs_PASSED)){ + sample<-Clarity_IDs_PASSED[i,1]; + search_results<-entrez_search(db = "sra", term = sample, retmax = 5) + if(length(search_results$ids)>0){ + metadata<-entrez_fetch(db ="sra", id = search_results$ids, rettype = "runinfo", retmodel ="text") + meta<-strsplit(metadata[[1]],"[\n,]")[[1]] + } else { + cat("No SRA Accession, and Biosample Accession were found, NCBI/SRA has not processed the submission yet, Please try again later") + } + SRRv[i]<-meta[48]; SAMNv[i]<-meta[72]; sampleID[i]<-sample + # 48 and 72 are the cell location for SRR and SAMN respectively in the metadata table obtained from rentrez/NCBI/SRA DB + } + SRA_data <- as.data.frame(cbind(as.character(sampleID), as.character(SAMNv),as.character(SRRv))) + colnames(SRA_data)<-c("Clarity ID", "SRA Accession", "Biosample Accession") + cat("\rData from NCBI/SRA was retrieved sucesseful") + +# Retrieving GenBank Accession from GenBank DB using rentrez/nuccore + cat("Wait, Retrieving GenBank Accessions from NCBI/GenBank DB using rentrez") + GenBank_Accession<-vector();SRRv<-vector(); SAMNv<-vector();sampleID<-vector() + for (i in 1:nrow(Clarity_IDs_PASSED)){ + sample<-Clarity_IDs_PASSED[i,1]; + search_results<-entrez_search(db = "nuccore", term = sample, retmax = 5) + if(length(search_results$ids)>0){ + genbank_data <- entrez_fetch(db = "nuccore", id = search_results$ids, rettype = "gb", retmode = "text") + # writeLines(genbank_data, "genbank_data.gb") + meta<-strsplit(genbank_data[[1]],"[\n, ]")[[1]] + } else { + cat("No GenBank Accession, NCBI/GenBank has not processed the submission yet, Please try again later") + } + GenBank_Accession[i]<-meta[8]; sampleID[i]<-sample + # 8 is the cell location for GenBank accession in the metadata table obtained from rentrez/NCBI/GenBank DB + } + GenBank_data <- as.data.frame(cbind(as.character(sampleID), as.character(GenBank_Accession))) + colnames(GenBank_data)<-c("Clarity ID", "GenBank Accession") + SRA_GenBak_data<-merge(SRA_data,GenBank_data, by.x = "Clarity ID", by.y ="Clarity ID", all.x = TRUE) + +# GISAID Accessions: Note: Data from GISAID is typically accessed via their portal an must be downloaded manually. Once downloaded, this script +# will automatically detect the GISAID file and extract the assigned IDs from it. + GISAID_file<-rownames(fileSnapshot(runPath,pattern=glob2rx("gisaid*.tsv"))$info[which.max(fileSnapshot(runPath,pattern=glob2rx("gisaid*.tsv"))$info$mtime),]) + GISAID_path<-paste(runPath,GISAID_file,sep="/") + GISAID_data <- read.table(GISAID_path,header=TRUE,sep="\t", stringsAsFactors=FALSE,comment.char="", fill = T,quote="\"")[,c("Virus.name","Accession.ID")] + GISAID_data$Virus.name<-gsub("^hCoV-19/USA/(.*)/2025","\\1",GISAID_data$Virus.name) + colnames(GISAID_data)<-c("Clarity ID","GenBank Accesion") + Attributes<-merge(SRA_GenBak_data,GISAID_data, by.x ="Clarity ID", by.y = "Clarity ID", all.x = T) + colnames(Attributes)<-c("LIMS_TEST_ID","SRA Accession","Biosample Accession","GenBank Accession","GISAID Accesion") + +# Atributes.txt file generation + # adding to Attributes the samples that failed. All samples passed and failed must be submitted to Lab Ware 8 + Atributes<-merge(Clarity_IDs[2], Attributes, by.x ="V2", by.y = "LIMS_TEST_ID", all.x = TRUE) + colnames( Atributes)<-c("LIMS_TEST_ID", "SRA Accession", "Biosample Accession", "GenBank Accession", "GISAID Accession") + + Atributes$"GISAID Accession"[is.na(Atributes$"GISAID Accession")]= "not Submitted" + Atributes$"SRA Accession"[is.na(Atributes$"SRA Accession")]= "not Submitted" + Atributes$"Biosample Accession"[is.na(Atributes$"Biosample Accession")]= "not Submitted" + Atributes$"GenBank Accession"[is.na(Atributes$"GenBank Accession")]= "not Submitted" + + Attibuttes<-Atributes %>% mutate("Cluster Code" = "") + + write.table(Attibuttes,paste(runPath,"/",args[1],"_6Attributes.txt",sep=""), sep = ',', row.names = FALSE, col.names = TRUE, eol="\r\n", quote = FALSE) + cat("Attributes.txt file for Labware 8 is complete and can be found at",runPath,"\n") + diff --git a/Results_LW8.R b/Results_LW8.R new file mode 100755 index 0000000..38a6dd2 --- /dev/null +++ b/Results_LW8.R @@ -0,0 +1,38 @@ +#!/usr/bin/env Rscript +# Author: OLinares-Perdomo +# Usage: Results.txt file for LW8 (The ParFastas script and Cecret-wf must be executed before) +# Parameter: Run Name ; i.e. Combining_Results.R UT-VH0770-250110 +# last modified on 2025-01-15 + +library("data.table");library("dplyr");library("utils"); +library("lubridate");find("read.table");library("readr"); + +config<-read_yaml("config.yml") +base_path<-config$paths$covidseq_base +base_path + args = commandArgs(trailingOnly=T); +# detecting sequencer type (VH00770, VH01885 or A01290) + if(nchar(args[1])==17){date<-ymd(substr(args[1], 12, 17))} else{date<-ymd(substr(args[1], 11, 16))}; + runPath <-paste(base_path,args[1],sep="/"); + + # reading the SampleSheat + df1 = read.csv(paste(runPath,"covidseq_output/Logs_Intermediates/SampleSheetValidation/SampleSheet_Intermediate.csv" ,sep = "/")); + x = apply(df1,1, function(x) {any(grepl(x, pattern = "Patient")|grepl(x, pattern = "UT-UPHL")) }); + Matrix1<-as.data.frame(cbind((cbind(df1[x, ][,c(1)],df1[x, ][,c(3)])),"run name"=args[1])); +# getting results from Cecret wf + Matrix2<-read.csv(paste(runPath,"CecretPangolin/cecret_results.csv",sep ="/"))[,c("sample","pangolin_lineage","nextclade_clade","num_total")] +# Merging ss and resuts from Cecret + results<-merge(Matrix1,Matrix2, by.x="V1", by.y= "sample", all.x=T); + colnames(results)<-c("LIMS_TEST_ID","Submission ID","Run Name","Lineage","Nextclade clade","Non-Ambiguous Bases"); + results$Lineage[is.na(results$Lineage)]= "Not able to be sequenced" + results$"Nextclade clade"[is.na(results$"Nextclade clade")]= "Clade not detected" + results$"Non-Ambiguous Bases"[is.na(results$"Non-Ambiguous Bases")]= "0" +# ordering + results<-results %>% select(LIMS_TEST_ID, "Run Name","Submission ID","Non-Ambiguous Bases","Nextclade clade","Lineage"); +# saving results + write.table(results,paste("/Volumes/NGS/Analysis/covidseq/",args[1],"/",args[1], + "FinalResults.txt",sep=""), sep = ',', row.names = F, col.names = T, eol="\r\n", quote = F) + + write.table(results,paste("/Volumes/NGS/Analysis/covidseq/",args[1],"/",args[1], + "FinalResults.csv",sep=""), sep = ',', row.names = F, col.names = T, eol="\r\n", quote = F) + diff --git a/config.yml b/config.yml new file mode 100755 index 0000000..93e1fb4 --- /dev/null +++ b/config.yml @@ -0,0 +1,2 @@ +paths: + covidseq_base: "/Volumes/NGS/Analysis/covidseq"