diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..91114bf --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/DESCRIPTION b/DESCRIPTION index 0eae041..13edf33 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Rpairix Title: Rpairix -Version: 0.0.3 +Version: 0.0.4 Authors@R: person("Soo", "Lee", email = "duplexa@gmail.com", role = c("aut", "cre")) Description: R binder for pairix, tool for querying a pair of genomic ranges in a pairs file (pairix-indexed bgzipped text file) Depends: diff --git a/NAMESPACE b/NAMESPACE index f015e73..a3749e6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(px_exists) export(px_keylist) export(px_query) export(px_seq1list) @@ -9,3 +10,4 @@ useDynLib(Rpairix,get_keylist) useDynLib(Rpairix,get_keylist_size) useDynLib(Rpairix,get_lines) useDynLib(Rpairix,get_size) +useDynLib(Rpairix,key_exists) diff --git a/R/px_exists.R b/R/px_exists.R new file mode 100644 index 0000000..3f2207e --- /dev/null +++ b/R/px_exists.R @@ -0,0 +1,30 @@ +#' Check function on pairix-indexed pairs file. +#' +#' This function allows you to check if a key (chr for 1D, chr pair for 2D) exists in a pairs file. +#' +#' @param filename a pairs file, or a bgzipped text file (sometextfile.gz) with an index file sometextfile.gz.px2 in the same folder. +#' @param key a pair of chromosomes in the query string format (e.g. "chr1|chr2"), or a chromosome for a 1D-indexed pairs file (e.g. "chr1"). +#' +#' @keywords pairix check +#' @export px_exists +#' @examples +#' filename = system.file(".","test_4dn.pairs.gz", package="Rpairix") +#' key = "chrX|chrX" +#' res = px_exists(filename, key) +#' print(res) +#' +#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix") +#' key = "10|20" +#' res = px_exists(filename, key) +#' print(res) +#' +#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz", +#' package="Rpairix") +#' key = "10|20" +#' res = px_exists(filename, key) +#' print(res) +#' @useDynLib Rpairix key_exists +px_exists<-function(filename, key){ + out = .C("key_exists", filename, key, as.integer(0)) + return(out[[3]][1]) +} diff --git a/R/px_keylist.R b/R/px_keylist.R index 26b1358..62b8efd 100644 --- a/R/px_keylist.R +++ b/R/px_keylist.R @@ -7,12 +7,19 @@ #' @keywords pairix query 2D #' @export px_keylist #' @examples +#' filename = system.file(".","test_4dn.pairs.gz", package="Rpairix") +#' res = px_keylist(filename) +#' print(res) +#' #' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix") #' res = px_keylist(filename) +#' print(res) +#' #' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz", #' package="Rpairix") -#' #' res = px_keylist(filename) +#' print(res) +#' #' @useDynLib Rpairix get_keylist_size get_keylist px_keylist<-function(filename){ # first-round, get the max length and the number of items in the key list. diff --git a/R/px_query.R b/R/px_query.R index a3b9d04..a949b32 100644 --- a/R/px_query.R +++ b/R/px_query.R @@ -10,20 +10,28 @@ #' @keywords pairix query 2D #' @export px_query #' @examples -#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix") +#' filename = system.file(".","test_4dn.pairs.gz", package="Rpairix") +#' querystr = "chrX|chrX" +#' res = px_query(filename, querystr) +#' print(res) +#' +#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz", package="Rpairix") #' querystr = "10:1-1000000|20" -#' res = px_query(filename,querystr) +#' res = px_query(filename, querystr) +#' print(res) #' #' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz", #' package="Rpairix") #' querystr = "10:1-1000000|20" -#' res = px_query(filename,querystr) +#' res = px_query(filename, querystr) +#' print(res) +#' #' @useDynLib Rpairix get_size get_lines px_query<-function(filename, querystr, max_mem=100000000, stringsAsFactors=FALSE){ # first-round, get the max length and the number of lines of the result. out =.C("get_size", filename, querystr, as.integer(0), as.integer(0), as.integer(0)) - if(out[[5]][1] == -1 ) return(NULL) ## error + if(out[[5]][1] == -1 ) { message("Can't open input file"); return(NULL) } ## error str_len = out[[4]][1] n=out[[3]][1] total_size = str_len * n diff --git a/R/px_seq1list.R b/R/px_seq1list.R index cb97369..5d6b3de 100644 --- a/R/px_seq1list.R +++ b/R/px_seq1list.R @@ -7,8 +7,13 @@ #' @keywords pairix query 2D #' @export px_seq1list #' @examples +#' filename = system.file(".","test_4dn.pairs.gz", package="Rpairix") +#' res = px_seq1list(filename) +#' print(res) +#' #' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix") #' res = px_seq1list(filename) +#' print(res) px_seq1list<-function(filename){ seqpairs = px_keylist(filename) seq1_list = unique(sapply(seqpairs,function(xx)strsplit(xx,'|',fixed=T)[[1]][1])) diff --git a/R/px_seq2list.R b/R/px_seq2list.R index 58301d3..5aa6931 100644 --- a/R/px_seq2list.R +++ b/R/px_seq2list.R @@ -7,8 +7,13 @@ #' @keywords pairix query 2D #' @export px_seq2list #' @examples +#' filename = system.file(".","test_4dn.pairs.gz", package="Rpairix") +#' res = px_seq2list(filename) +#' print(res) +#' #' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix") #' res = px_seq2list(filename) +#' print(res) px_seq2list<-function(filename){ seqpairs = px_keylist(filename) seq2_list = unique(sapply(seqpairs,function(xx)strsplit(xx,'|',fixed=T)[[1]][2])) diff --git a/R/px_seqlist.R b/R/px_seqlist.R index 16d59d4..fc58d0b 100644 --- a/R/px_seqlist.R +++ b/R/px_seqlist.R @@ -7,8 +7,13 @@ #' @keywords pairix query 2D #' @export px_seqlist #' @examples +#' filename = system.file(".","test_4dn.pairs.gz", package="Rpairix") +#' res = px_seqlist(filename) +#' print(res) +#' #' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix") #' res = px_seqlist(filename) +#' print(res) px_seqlist<-function(filename){ seqpairs = px_keylist(filename) seq1_list = unique(sapply(seqpairs,function(xx)strsplit(xx,'|',fixed=T)[[1]][1])) diff --git a/README.md b/README.md index 9eb56ea..1f44d59 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ R --no-site-file --no-environ --no-save --no-restore CMD INSTALL --install-tests To install a specific version, ``` library(devtools) -install_url("https://github.com/4dn-dcic/Rpairix/archive/0.0.3.zip") +install_url("https://github.com/4dn-dcic/Rpairix/archive/0.0.4.zip") ``` @@ -72,43 +72,42 @@ px_seq2list(filename) * The filename is sometextfile.gz and an index file sometextfile.gz.px2 must exist. * The return value is a vector of second chromosomes. +### Check if a chromosome pair (or chromosome, for 1D) exists +``` +px_exists(filename, key) +``` +* The filename is sometextfile.gz and an index file sometextfile.gz.px2 must exist. +* Key is a chromosome pair (or a chromosome for 1D) +* The return value is 1 (exists), 0 (not exist), or -1 (error) ## Example run ``` > library(Rpairix) -> filename = "inst/merged_nodup.tab.chrblock_sorted.txt.gz" -> querystr = "10:1-1000000|20" +> filename = "inst/test_4dn.pairs.gz" +> querystr = "chr10:1-3000000|chr20" > res = px_query(filename,querystr) > print(res) - V1 V2 V3 V4 V5 V6 V7 V8 -1 0 10 624779 1361 0 20 40941397 97868 -2 16 10 948577 2120 16 20 59816485 148396 -> -> keys = px_keylist("inst/merged_nodup.tab.chrblock_sorted.txt.gz") + V1 V2 V3 V4 V5 V6 V7 +1 SRR1658581.51740952 chr10 157600 chr20 167993 - - +2 SRR1658581.33457260 chr10 2559777 chr20 7888262 - + +> keys = px_keylist(filename) > length(keys) -[1] 1239 +[1] 800 > keys[1:10] -[1] "1|1" "1|10" "1|11" "1|12" "1|13" "1|14" "1|15" "1|16" "1|17" "1|18" -> ->chrs = px_seqlist("inst/merged_nodup.tab.chrblock_sorted.txt.gz") ->chrs - [1] "1" "10" "11" "12" "13" - [6] "14" "15" "16" "17" "18" -[11] "19" "2" "20" "21" "22" -[16] "3" "4" "5" "6" "7" -[21] "8" "9" "GL000191.1" "GL000192.1" "GL000193.1" -[26] "GL000194.1" "GL000195.1" "GL000196.1" "GL000197.1" "GL000198.1" -[31] "GL000199.1" "GL000200.1" "GL000201.1" "GL000202.1" "GL000203.1" -[36] "GL000204.1" "GL000205.1" "GL000206.1" "GL000208.1" "GL000209.1" -[41] "GL000210.1" "GL000211.1" "GL000212.1" "GL000213.1" "GL000214.1" -[46] "GL000215.1" "GL000216.1" "GL000217.1" "GL000218.1" "GL000219.1" -[51] "GL000220.1" "GL000221.1" "GL000222.1" "GL000223.1" "GL000224.1" -[56] "GL000225.1" "GL000226.1" "GL000227.1" "GL000228.1" "GL000229.1" -[61] "GL000230.1" "GL000231.1" "GL000232.1" "GL000233.1" "GL000234.1" -[66] "GL000235.1" "GL000236.1" "GL000237.1" "GL000238.1" "GL000239.1" -[71] "GL000240.1" "GL000241.1" "GL000242.1" "GL000243.1" "GL000244.1" -[76] "GL000245.1" "GL000246.1" "GL000247.1" "GL000248.1" "GL000249.1" -[81] "MT" "NC_007605" "X" "Y" + [1] "chr1|chr1" "chr1|chr10" "chr1|chr11" + [4] "chr1|chr12" "chr1|chr13" "chr1|chr14" + [7] "chr1|chr15" "chr1|chr16" "chr1|chr17" +[10] "chr1|chr17_ctg5_hap1" +> chrs = px_seqlist(filename) +> length(chrs) +[1] 82 +> chrs[1:10] + [1] "chr1" "chr1_gl000191_random" "chr1_gl000192_random" + [4] "chr10" "chr11" "chr11_gl000202_random" + [7] "chr12" "chr13" "chr14" +[10] "chr15" +> px_exists(filename, "chr10|chr20") +[1] 1 ``` @@ -122,3 +121,19 @@ document() Individual R functions are written and documented in `R/`. The `src/rpairixlib.c` is the main C source file. Raw data files are under `inst/`. +## Version history +### 0.0.4 +* Function px_exists is now added. +* Source is synced with pairix/pypairix 0.1.1. +* 4dn pairs example is added + +### 0.0.3 +* corrected a typo in README + +### 0.0.2 +* cleaned up repo +* added more instructions in README + +### 0.0.1 +* initial release + diff --git a/Rpairix.Rproj b/Rpairix.Rproj index d848a9f..aa7881a 100644 --- a/Rpairix.Rproj +++ b/Rpairix.Rproj @@ -1,4 +1,4 @@ -Version: 1.0 +Version: 0.0.4 RestoreWorkspace: No SaveWorkspace: No diff --git a/inst/test_4dn.pairs.gz b/inst/test_4dn.pairs.gz new file mode 100644 index 0000000..01041da Binary files /dev/null and b/inst/test_4dn.pairs.gz differ diff --git a/inst/test_4dn.pairs.gz.px2 b/inst/test_4dn.pairs.gz.px2 new file mode 100644 index 0000000..e2b0414 Binary files /dev/null and b/inst/test_4dn.pairs.gz.px2 differ diff --git a/man/px_exists.Rd b/man/px_exists.Rd new file mode 100644 index 0000000..84e4b88 --- /dev/null +++ b/man/px_exists.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/px_exists.R +\name{px_exists} +\alias{px_exists} +\title{Check function on pairix-indexed pairs file.} +\usage{ +px_exists(filename, key) +} +\arguments{ +\item{filename}{a pairs file, or a bgzipped text file (sometextfile.gz) with an index file sometextfile.gz.px2 in the same folder.} + +\item{key}{a pair of chromosomes in the query string format (e.g. "chr1|chr2"), or a chromosome for a 1D-indexed pairs file (e.g. "chr1").} +} +\description{ +This function allows you to check if a key (chr for 1D, chr pair for 2D) exists in a pairs file. +} +\examples{ +filename = system.file(".","test_4dn.pairs.gz", package="Rpairix") +key = "chrX|chrX" +res = px_exists(filename, key) +print(res) + +filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix") +key = "10|20" +res = px_exists(filename, key) +print(res) + +filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz", +package="Rpairix") +key = "10|20" +res = px_exists(filename, key) +print(res) +} +\keyword{check} +\keyword{pairix} + diff --git a/man/px_keylist.Rd b/man/px_keylist.Rd index 7d5221f..73d02f7 100644 --- a/man/px_keylist.Rd +++ b/man/px_keylist.Rd @@ -13,12 +13,19 @@ px_keylist(filename) This function allows you to get the list of keys (chromosome pairs) in a pairix-indexed pairs file. } \examples{ +filename = system.file(".","test_4dn.pairs.gz", package="Rpairix") +res = px_keylist(filename) +print(res) + filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix") res = px_keylist(filename) +print(res) + filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz", package="Rpairix") - res = px_keylist(filename) +print(res) + } \keyword{2D} \keyword{pairix} diff --git a/man/px_query.Rd b/man/px_query.Rd index a7358c4..dd08f77 100644 --- a/man/px_query.Rd +++ b/man/px_query.Rd @@ -19,14 +19,22 @@ px_query(filename, querystr, max_mem = 1e+08, stringsAsFactors = FALSE) This function allows you to query a 2D range in a pairix-indexed pairs file. } \examples{ -filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix") +filename = system.file(".","test_4dn.pairs.gz", package="Rpairix") +querystr = "chrX|chrX" +res = px_query(filename, querystr) +print(res) + +filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz", package="Rpairix") querystr = "10:1-1000000|20" -res = px_query(filename,querystr) +res = px_query(filename, querystr) +print(res) filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz", package="Rpairix") querystr = "10:1-1000000|20" -res = px_query(filename,querystr) +res = px_query(filename, querystr) +print(res) + } \keyword{2D} \keyword{pairix} diff --git a/man/px_seq1list.Rd b/man/px_seq1list.Rd index 10e8520..6fc8e0e 100644 --- a/man/px_seq1list.Rd +++ b/man/px_seq1list.Rd @@ -13,8 +13,13 @@ px_seq1list(filename) This function allows you to get the list of first chromosomes in a pairix-indexed pairs file. } \examples{ +filename = system.file(".","test_4dn.pairs.gz", package="Rpairix") +res = px_seq1list(filename) +print(res) + filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix") res = px_seq1list(filename) +print(res) } \keyword{2D} \keyword{pairix} diff --git a/man/px_seq2list.Rd b/man/px_seq2list.Rd index fe59afc..8c60888 100644 --- a/man/px_seq2list.Rd +++ b/man/px_seq2list.Rd @@ -13,8 +13,13 @@ px_seq2list(filename) This function allows you to get the list of second chromosomes in a pairix-indexed pairs file. } \examples{ +filename = system.file(".","test_4dn.pairs.gz", package="Rpairix") +res = px_seq2list(filename) +print(res) + filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix") res = px_seq2list(filename) +print(res) } \keyword{2D} \keyword{pairix} diff --git a/man/px_seqlist.Rd b/man/px_seqlist.Rd index 019b3dc..8de9585 100644 --- a/man/px_seqlist.Rd +++ b/man/px_seqlist.Rd @@ -13,8 +13,13 @@ px_seqlist(filename) This function allows you to get the list of chromosomes in a pairix-indexed pairs file. } \examples{ +filename = system.file(".","test_4dn.pairs.gz", package="Rpairix") +res = px_seqlist(filename) +print(res) + filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix") res = px_seqlist(filename) +print(res) } \keyword{2D} \keyword{pairix} diff --git a/src/Rpairix.so b/src/Rpairix.so deleted file mode 100755 index e8ec769..0000000 Binary files a/src/Rpairix.so and /dev/null differ diff --git a/src/index.c b/src/index.c index ddea208..0f2560b 100644 --- a/src/index.c +++ b/src/index.c @@ -16,6 +16,8 @@ #define TAD_LIDX_SHIFT 14 #define REGION_SPLIT_CHARACTER '|' #define DEFAULT_DELIMITER '\t' +#define MAX_REGION_STR_LEN 10000 + typedef struct { uint64_t u, v; @@ -801,7 +803,7 @@ int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin } else *end = 1<<29; if (*begin > 0) --*begin; free(s); - if (*begin > *end) return -1; + if (*begin > *end) return -2; return 0; } @@ -829,8 +831,8 @@ int ti_parse_region2d(const ti_index_t *idx, const char *str, int *tid, int *beg if(i == k) { //1d query *begin2=-1; *end2=-1; - if(ti_parse_region(idx,str,tid,begin,end)==0) { free(s); return 0; } - else {free(s); return -1; } + int res = ti_parse_region(idx,str,tid,begin,end); + free(s); return (res); }else{ //2d query coord1s=0; coord1e=i; coord2s=i+1; coord2e=k; @@ -1074,6 +1076,16 @@ int ti_fetch_2d(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, int const ti_conf_t *ti_get_conf(ti_index_t *idx) { return idx? &idx->conf : 0; } +/* Get 0-based column index and delimiter */ +int ti_get_sc(ti_index_t *idx) { return idx? idx->conf.sc-1 : -1; } +int ti_get_sc2(ti_index_t *idx) { return idx? idx->conf.sc2-1 : -1; } +int ti_get_bc(ti_index_t *idx) { return idx? idx->conf.bc-1 : -1; } +int ti_get_bc2(ti_index_t *idx) { return idx? idx->conf.bc2-1 : -1; } +int ti_get_ec(ti_index_t *idx) { return idx? idx->conf.ec-1 : -1; } +int ti_get_ec2(ti_index_t *idx) { return idx? idx->conf.ec2-1 : -1; } +char ti_get_delimiter(ti_index_t *idx) { return idx? idx->conf.delimiter : 0; } + + /******************* * High-level APIs * *******************/ @@ -1116,7 +1128,7 @@ sequential_iter_t *ti_querys_2d_general(pairix_t *t, const char *reg) { int n_seqpair_list; int n_sub_list; - char *sp, *chr1, *chr2, **chr1list, **chr1pairlist, **chrpairlist; + char *sp, *chr1, *chr2, **chr1list, **chr2list, **chrpairlist; char *chrend; char chronly=1; int i; @@ -1131,10 +1143,8 @@ sequential_iter_t *ti_querys_2d_general(pairix_t *t, const char *reg) *chrend=0; chronly=0; } char **chrpairlist = ti_seqname(t->idx, &n_seqpair_list); - fprintf(stderr,"chr2 = %s, n_seqpair_list = %d\n", chr2, n_seqpair_list); //debugging chr1list = get_seq1_list_for_given_seq2(chr2, chrpairlist, n_seqpair_list, &n_sub_list); if(chronly==0) *chrend=':'; // revert to original region string including beg and end - fprintf(stderr, "chr2 = %s, n_sub_list = %d\n", chr2, n_sub_list); // debugging // create an array of regions in string. char **regions = malloc(n_sub_list * sizeof(char*)); for(i=0;iidx, &n_seqpair_list); - chr1pairlist = get_seq2_list_for_given_seq1(chr1, chrpairlist, n_seqpair_list, &n_sub_list); - if(chronly==0) *chrend=':'; - fprintf(stderr,"%s\n", chr1pairlist[0]); - sequential_iter_t *siter = ti_querys_2d_multi(t, chr1pairlist, n_sub_list); + chr2list = get_seq2_list_for_given_seq1(chr1, chrpairlist, n_seqpair_list, &n_sub_list); + if(chronly==0) *chrend=':'; // revert to original region string including beg and end + + // create an array of regions in string. + char **regions = malloc(n_sub_list * sizeof(char*)); + for(i=0;iidx, tid, beg, end, beg2, end2); } +sequential_iter_t *ti_queryi_2d_general(pairix_t *t, int tid, int beg, int end, int beg2, int end2) +{ + sequential_iter_t *siter = create_sequential_iter(t); + int i; + add_to_sequential_iter ( siter, ti_queryi_2d(t,tid,beg,end,beg2,end2) ); + return(siter); +} + ti_iter_t ti_querys(pairix_t *t, const char *reg) { return ti_querys_2d(t,reg); } +sequential_iter_t *ti_querys_general(pairix_t *t, const char *reg) +{ + sequential_iter_t *siter = create_sequential_iter(t); + int i; + add_to_sequential_iter ( siter, ti_querys(t, reg) ); + return(siter); +} + + ti_iter_t ti_querys_2d(pairix_t *t, const char *reg) { int tid, beg, end, beg2, end2; @@ -1216,6 +1253,16 @@ ti_iter_t ti_querys_2d(pairix_t *t, const char *reg) return ti_iter_query(t->idx, tid, beg, end, beg2, end2); } +sequential_iter_t *ti_querys_2d_multi(pairix_t *t, const char **regs, int nRegs) +{ + sequential_iter_t *siter = create_sequential_iter(t); + int i; + for(i=0;iidx, tid, beg, end, -1, -1); } +sequential_iter_t *ti_query_general(pairix_t *t, const char *name, int beg, int end) +{ + sequential_iter_t *siter = create_sequential_iter(t); + int i; + add_to_sequential_iter (siter, ti_query(t, name, beg, end)); + return(siter); +} + ti_iter_t ti_query_2d(pairix_t *t, const char *name, int beg, int end, const char *name2, int beg2, int end2) { int tid; @@ -1243,6 +1298,14 @@ ti_iter_t ti_query_2d(pairix_t *t, const char *name, int beg, int end, const cha return ti_iter_query(t->idx, tid, beg, end, beg2, end2); } +sequential_iter_t *ti_query_2d_general(pairix_t *t, const char *name, int beg, int end, const char *name2, int beg2, int end2) +{ + sequential_iter_t *siter = create_sequential_iter(t); + int i; + add_to_sequential_iter (siter, ti_query_2d(t, name, beg, end, name2, beg2, end2)); + return(siter); +} + int ti_querys_tid(pairix_t *t, const char *reg) { return ti_querys_2d_tid(t,reg); @@ -1260,11 +1323,13 @@ int ti_querys_2d_tid(pairix_t *t, const char *reg) int ti_query_tid(pairix_t *t, const char *name, int beg, int end) { - int tid; + int tid, parse_err; if (name == 0) return -3 ; // then need to load the index if (ti_lazy_index_load(t) != 0) return -3; - return( ti_get_tid(t->idx, name) ); + if (ti_get_tid(t->idx, name) <0) return -1; + if (beg > end) return -2; + else return 0; } int ti_query_2d_tid(pairix_t *t, const char *name, int beg, int end, const char *name2, int beg2, int end2) @@ -1280,7 +1345,10 @@ int ti_query_2d_tid(pairix_t *t, const char *name, int beg, int end, const char if (name == 0) return ti_iter_first(); // then need to load the index if (ti_lazy_index_load(t) != 0) return 0; - return (ti_get_tid(t->idx, namepair)); + if(ti_get_tid(t->idx, namepair) <0) return -1; + if (beg > end) return -2; + if (beg2 > end2) return -2; + else return 0; } @@ -1305,9 +1373,11 @@ sequential_iter_t *create_sequential_iter(pairix_t *t) void destroy_sequential_iter(sequential_iter_t *siter) { int i; - for(i=0;in;i++) ti_iter_destroy(siter->iter[i]); - free(siter->iter); - free(siter); + if(siter){ + for(i=0;in;i++) ti_iter_destroy(siter->iter[i]); + free(siter->iter); + free(siter); + } } // add an iter element to a sequential iterator, the size is dynamically incremented. @@ -1425,7 +1495,7 @@ const char *merged_ti_read(merged_iter_t *miter, int *len) const char *sequential_ti_read(sequential_iter_t *siter, int *len) { - if(!siter) { fprintf(stderr,"Null merged_iter_t\n"); return(NULL); } + if(!siter) { fprintf(stderr,"Null sequential_iter_t\n"); return(NULL); } if(siter->n<=0) { fprintf(stderr,"No iter_unit lement in merged_iter_t\n"); return(NULL); } char *s = ti_iter_read(siter->t->fp,siter->iter[siter->curr], len, 0); @@ -1591,12 +1661,27 @@ char **get_seq1_list_for_given_seq2(char *seq2, char **seqpair_list, int n_seqpa k++; } } - fprintf(stderr,"k=%d, *pn_sub_list=%d\n",k,*pn_sub_list); // debugging assert (k = *pn_sub_list); return(sublist); } +/* convert string 'region1|region2' to 'region2|region1' */ +char *flip_region ( char* s) { + char s_flp[MAX_REGION_STR_LEN]; + int l, i, l2, split_pos; + l = strlen(s); + for(i = 0; i != l; i++) if( s[i] == REGION_SPLIT_CHARACTER) break; + s[i]=0; + split_pos = i; + l2 = l-1-i; + strcpy(s_flp, s + i + 1); + s_flp[l2] = REGION_SPLIT_CHARACTER; + strcpy(s_flp + l2 + 1, s); + s[i]= REGION_SPLIT_CHARACTER; + return(s_flp); +} + // given a chromosome for mate1 (seq1='chr1') return the array containing all seqpairs matching seq1 ('chr1|chr1', 'chr1|chr2', ... ) // the returned subarray contains pointers to the original seqpair_list elements. char **get_sub_seq_list_for_given_seq1(char *seq1, char **seqpair_list, int n_seqpair_list, int *pn_sub_list) @@ -1738,90 +1823,3 @@ char **uniq(char** seq_list, int n_seq_list, int *pn_uniq_seq) -// pairs merger - merge multiple 2D-sorted files into a merged, 2D-sorted stream -//pass null - bgzf_write is slower than piping to bgzip -c. -int pairs_merger(char **fn, int n, BGZF *bzfp) // pass bgfp if the result should be bgzipped. or pass NULL. -{ - pairix_t *tbs[n]; - int i,j; - int reslen; - int n_uniq_seq=0; - char **uniq_seq_list=NULL; - char *s=NULL; - merged_iter_t *miter=NULL; - ti_iter_t iter; - - // opening files and creating an array of pairix_t struct and prepare a concatenated seqname array - fprintf(stderr,"Opening files...\n"); - for(i=0;iiu[j]); - } - while ( s=merged_ti_read(miter,&reslen) ) puts(s); - //while ( s=merged_ti_read(miter,&reslen) ) if (bgzf_write(bzfp, s, reslen) < 0) fail(bzfp); - destroy_merged_iter(miter); miter=NULL; - } - for(i=0;iUp converter - convert a single 2D-sorted file into a 1D-sorted stream. -int stream_1d(char *fn) -{ - pairix_t *tb, **tbs_copies=NULL; - int n_chrpairs, n_chr1, n_chr1pairs; - char **chrpair_list, **chr1_list, **chr1pair_list; - int i,j; - char *s=NULL; - merged_iter_t *miter=NULL; - ti_iter_t iter; - int reslen; - - tb = load_from_file(fn); - if(tb==NULL) { fprintf(stderr,"file load failed\n"); return(1); } - chrpair_list = ti_seqname(tb->idx, &n_chrpairs); - if(chrpair_list==NULL) { fprintf(stderr, "Cannot retrieve key list\n"); return(1); } - chr1_list = get_seq1_list_from_seqpair_list(chrpair_list, n_chrpairs, &n_chr1); // 'chr1','chr2',... - if(chr1_list==NULL) { fprintf(stderr, "Cannot retrieve list of first chromosomes\n"); return(1); } - - for(i=0;iiu[j]); - } - while ( s=merged_ti_read(miter,&reslen) ) puts(s); - destroy_merged_iter(miter); miter=NULL; - for(j=0;jidx = ti_index_load(fn); - else printf("File doesn't exist.\n"); return(tb); } @@ -100,4 +99,12 @@ void get_keylist(char** pfn, char** pkeylist, int* pflag){ else *pflag = -1; // error } +//check if a key (chr for 1D, chr pair for 2D) exists +void key_exists(char** pfn, char** pkey, int* pflag){ + pairix_t *tb = load(*pfn); + if(tb){ + *pflag = ti_get_tid(tb->idx, *pkey)!=-1?1:0; + } + else *pflag= -1; +}