# File: AllTheBacteria_functions.R
# Repository: forked from interpretAMR/datacuration
# Author: Zoe Dyson ([email protected])
# Title: AllTheBacteria_functions.R
# Date: 17/10/2024
# Package management
# Packages this script depends on
packages <- c("tidyverse")
# Flag which of the required packages are already installed, then install
# only the ones that are missing
installed_packages <- is.element(packages, rownames(installed.packages()))
if (!all(installed_packages)) {
  install.packages(packages[!installed_packages])
}
# Attach every required package (return value of lapply is discarded)
invisible(lapply(packages, function(pkg) library(pkg, character.only = TRUE)))
# atb_amrfp_filter_by_taxa: Extract specific taxa from the AllTheBacteria
# AMRFinderPlus (AMRFP) data (from https://osf.io/zgexh, accessed 17/10/24)
# (via browser, click three dots, download)
#
# Parameters:
# 1. user_taxa = character vector of taxa to be extracted (default = Salmonella).
#    Each entry is passed to grepl() as a regular expression and matched
#    against the "Species" column of the species calls, so "Salmonella" also
#    selects any species name that merely contains that text.
# 2. atb_armfp_results_path = tab delimited file containing complete AMRFP
#    calls (read in chunks; must have a "Name" column of sample accessions)
# 3. atb_armfp_qc_status_path = tab delimited file of per-sample AMRFP status
#    (joined onto the result via its "sample" column)
# 4. atb_species_results_path = tab delimited file containing taxonomic
#    assignments (must have "Sample" and "Species" columns)
# 5. chunk_size = number of rows of the AMRFP calls file processed per chunk
#    (default = 10000)
#
# Note: the "armfp" spelling in the parameter names is kept as-is so existing
# callers that pass these arguments by name keep working.
#
# Returns: data frame of AMRFP hits for specific taxa, with the columns of the
# species calls and of the AMRFP status table joined on by sample accession
#
atb_amrfp_filter_by_taxa <- function(user_taxa = c("Salmonella"),
                                     atb_armfp_results_path = "AllTheBacteria/AMRFP_results.tsv.gz",
                                     atb_armfp_qc_status_path = "AllTheBacteria/ATB_AMRFP_status.tsv.gz",
                                     atb_species_results_path = "AllTheBacteria/ATB_species_calls.tsv.gz",
                                     chunk_size = 10000) {
  # fail fast on an unusable chunk size, before any (large) file is opened
  if (!is.numeric(chunk_size) || length(chunk_size) != 1L ||
      is.na(chunk_size) || chunk_size < 1) {
    stop("`chunk_size` must be a single positive number", call. = FALSE)
  }
  # iterate over plain strings even if a factor (or other atomic) was supplied
  user_taxa <- as.character(user_taxa)

  # load the (comparatively small) per-sample tables in full
  atb_species_results <- read_tsv(atb_species_results_path)
  atb_armfp_qc_status <- read_tsv(atb_armfp_qc_status_path)

  missing_columns <- setdiff(c("Sample", "Species"), names(atb_species_results))
  if (length(missing_columns) > 0) {
    stop(
      "Species calls file is missing required column(s): ",
      paste(missing_columns, collapse = ", "),
      call. = FALSE
    )
  }

  # Flag every species row matching at least one requested taxon. Working on
  # the extracted column (rather than inside filter()) means a column that
  # happens to share a name with a local variable cannot shadow it.
  species_names <- atb_species_results[["Species"]]
  is_selected <- rep(FALSE, length(species_names))
  for (taxon in user_taxa) {
    hits <- grepl(taxon, species_names)
    # an NA pattern yields NA hits; treat those as "no match"
    is_selected <- is_selected | (!is.na(hits) & hits)
  }
  # de-duplicated accessions of all samples assigned to the requested taxa
  selected_accessions <- unique(atb_species_results[["Sample"]][is_selected])

  # chunk callback: keep only the AMRFP rows belonging to selected samples
  # (pos is required by the callback interface but not used)
  keep_selected <- function(chunk, pos) {
    chunk[chunk[["Name"]] %in% selected_accessions, , drop = FALSE]
  }

  # stream the AMRFP calls so the complete file never has to fit in memory
  selected_atb_amrfp <- read_tsv_chunked(
    file = atb_armfp_results_path,
    callback = DataFrameCallback$new(keep_selected),
    chunk_size = chunk_size
  )

  # annotate each hit with its species call and AMRFP status
  selected_atb_amrfp %>%
    left_join(atb_species_results, by = c("Name" = "Sample")) %>%
    left_join(atb_armfp_qc_status, by = c("Name" = "sample"))
}