#ifndef PATHWORKS_H
#define PATHWORKS_H 1

// Shared constants, record layouts and function prototypes.
// The include guard opened above is closed by the #endif on the last line of
// this header, so including the header more than once in a translation unit
// is harmless.

// use radix sort instead of qsort because it is faster
#define RADIX_SORT 1

// values for the "calc_option" argument of l2pfunc()
#define CALC_OPTION_FE 0
#define CALC_OPTION_GPCC 1
#define CALC_OPTION_PERMUTE 2
#define CALC_OPTION_PERMUTE3 3
#define CALC_OPTION_GPCC2 4
#define CALC_OPTION_GPCC3 5
#define CALC_OPTION_GPCC4 6

#define PERMUTE_DEFAULT_LOW 5000

#define MAXBSIDPOSSIBLE 851568 // for pwharvest
#define MAXGENE 70000          // last count 61141 pathworksgenes.txt
#define MAX_INGENES 40000

// types
#define type_functional_set 1
#define type_pathway 2
#define type_structural_complex 3
#define type_custom 4
#define type_unknown 9

// scope
#define conserved_biosystem_scope 10
#define organism_specific_biosystem_scope 11

// category bit flags (one bit per pathway source)
#define CAT_NCBI_BIOCYC (1<<0)       // count = 294
#define CAT_NCBI_GO (1<<1)           // count = 13515
#define CAT_NCBI_KEGG (1<<2)         // count = 485
#define CAT_NCBI_PANTH (1<<3)        // count = 129
#define CAT_NCBI_PID (1<<4)          // count = 183
#define CAT_NCBI_REACTOME (1<<5)     // count = 1548
#define CAT_NCBI_WikiPathways (1<<6) // count = 345
#define CAT_MSIG_C1 (1<<7)           // count = 325
#define CAT_MSIG_C2 (1<<8)           // count = 3777
#define CAT_MSIG_C3 (1<<9)           // count = 836
#define CAT_MSIG_C4 (1<<10)          // count = 858
#define CAT_MSIG_C5 (1<<11)          // count = 5871
#define CAT_MSIG_C6 (1<<12)          // count = 144
#define CAT_MSIG_C7 (1<<13)          // count = 1888
#define CAT_MSIG_C8 (1<<14)
#define CAT_MSIG_H (1<<15)           // count = 50
#define CAT_CUSTOM (1<<16)
#if 0
#define CAT_MSIG_ARCHIVED (1<<15)    // count = 858
#endif

// Combined masks: every NCBI category bit / every MSIG category bit.
// Plain parenthesized expressions (no '=' and no trailing ';') so they can be
// used inside larger expressions, e.g. (cats & NCBI_PAT).
#define NCBI_PAT (CAT_NCBI_BIOCYC|CAT_NCBI_GO|CAT_NCBI_KEGG|CAT_NCBI_PANTH|CAT_NCBI_PID|CAT_NCBI_REACTOME|CAT_NCBI_WikiPathways)
#define MSIG_PAT (CAT_MSIG_C1|CAT_MSIG_C2|CAT_MSIG_C3|CAT_MSIG_C4|CAT_MSIG_C5|CAT_MSIG_C6|CAT_MSIG_C7|CAT_MSIG_C8|CAT_MSIG_H)

struct binpathouttype // the "binary" pathway information file
{
    int bsid;       // 32 bit integer. note: originally "bs" was for "biosystems"
    int category;   // bit pattern for each category (particular bit may be set to turn on) examples: CAT_NCBI_GO
    int accession;  // spill from char * : this is an "offset from spill space start"
    int name;       // spill for char *
    int type;       // 1 byte, from char *
    int scope;      // 32 bit int from char *
    int taxid;      // 32 bit int, taxonomy id
    int desc;       // spill from char * : "points" (really offset) into spill space
    unsigned int numgenes;
    // little tricky here: "hits" is not set at record creation (it is set to null), then,
    // later (i.e. when running l2p), it is used in processing when the binpath[] data is
    // read in. use this "hits" field for the count of genes that hit this pathway
    int offset2geneids; // pointer to "numgenes" geneids

    // ***** NOTE: Different C compilers produce different sized records for this structure (binpathouttype).
    // ***** The output file for this will only contain the important-to-save fields:
    // ***** we only need to write out the above 10 fields. So output record size is 10*4=40 bytes.
#if 0
    // hits is a ptr to an array with
    // first element [0] = numhits, then the rest of the array is [1...n] ptrs to struct of generecs (bingenetype? right?)
    void *hits; // used in l2p for user "hits" to this pathway
#endif
};

#define MAXGENENAME 26 // maximum length ARHGAP27P1-BPTFP1-KPNA2P3 = 25

struct bingenetype
{
    int geneid;                // entrez gene id
    char hugo[MAXGENENAME];
    char ensembl[MAXGENENAME];
    int pathcount;             // count of paths, ids are in int array at "pathplace"
    int pathplace;             // index to path (to a struct binpathouttype record, see above)
    int categories;            // bit patterns
};

struct updated_genes_type
{
    char *newname;
    char *oldname;
    int change_flag;
    int status;
    int is_legit_name;
};

struct genelisttype // used by harvest programs
{
    int geneid;
    struct genelisttype *n;
};

struct raw_genelisttype // used by harvest programs
{
    char *raw; // raw gene name
    struct raw_genelisttype *n;
};

struct bstype // biosystems id and info - input into this array -- used by harvest programs
{
    int bsid;
    int category;  // use CAT_ bitpattern defines (above)
    char *accession;
    char *name;
    char *type;
    char *scope;
    int taxid;
    char *desc;
    int redundant; // flag for checking to see if this pathway is duplicated by another pathway
    // next fields get values from other file
    int numgenes;  // number of genes in the list(s) below
    struct genelisttype *geneslinkedlist;          // a linked list of FINAL genes
    struct raw_genelisttype *raw_genes_linkedlist; // a linked list of raw genes
};

struct hugo_type
{
    char *hugo;
    struct bingenetype *generec_ptr;
    int status; // this can be used for various purposes, initial reason is to use for "universe" masking
};

struct genetype // from ncbi
{
    // this is (may) only used in pwharvest, l2p uses bingenetype
    int geneid;
    char *hugo;
    char *ensembl;
    int categories;
};

struct hit_type
{
    unsigned int hitcnt;
    unsigned int maxhits;
    unsigned int *hitsindexes;
};

// pathway commons
// NOTE(review): several of these are short lowercase macro names (e.g. "other",
// "reference", "multiple", "transport"); any identifier with the same spelling in
// code that includes this header will be rewritten by the preprocessor.
#define chemical_affects (1<<0)
#define in_complex_with (1<<1)
#define catalysis_precedes (1<<2)
#define controls_expression_of (1<<3)
#define controls_state_change_of (1<<4)
#define controls_production_of (1<<5)
#define consumption_controlled_by (1<<6)
#define controls_phosphorylation_of (1<<7)
#define used_to_produce (1<<8)
#define transport (1<<9)
#define reacts_with (1<<10)
#define interacts_with (1<<11)
#define reference (1<<12)
#define multiple (1<<13)
#define other (1<<14)
#define ABdirection (1<<15)

#define MAXPC 2000000 // latest 1915769 PathwayCommons12.All.hgnc.txt

struct pctype // pathway commons type
{
    int ID_Interactor_A;
    int ID_Interactor_B;
    char *hugo1;
    char *hugo2;
    unsigned short int interaction_type;
    int is_dupe;
};

#define MAXBIOGRID 303568

// bits for "interaction_type" field ...
//                                                                    count   name
#define association 1                                              // 8931   psi-mi:"MI:0914(association)"
#define colocalization 2                                           // 44101  psi-mi:"MI:0403(colocalization)"
#define synthetic_genetic_interaction_defined_by_inequality 4      // 50045  psi-mi:"MI:0794(synthetic genetic interaction defined by inequality)"
#define suppressive_genetic_interaction_defined_by_inequality 8    // 197811 psi-mi:"MI:0796(suppressive genetic interaction defined by inequality)"
#define direct_interaction 16                                      // 206875 psi-mi:"MI:0407(direct interaction)"
#define physical_association 32                                    // 329721 psi-mi:"MI:0915(physical association)"
#define additive_genetic_interaction_defined_by_inequality 64      // 535593 psi-mi:"MI:0799(additive genetic interaction defined by inequality)"

struct biogridtype
{
    int ID_Interactor_A;
    int ID_Interactor_B;
    int interaction_type;
};

struct smallgenetype
{
    char *hugo;        // hugo = human gene name nomenclature authority ("official gene name")
    unsigned int egid; // entrez gene id
};

struct used_path_type
{
    unsigned int category;
    char *custom_category_name;
    char *acc;
    char *name;
    unsigned int numgenes;      // original number of genes in pathway
    unsigned int numfixedgenes; // after fixing
    unsigned int *egids;
    unsigned int hitcnt;
    unsigned int *genehits;     // put hits here. reason: need to print them out
    unsigned int aughitcnt;     // not used . fix
    double pathhits_gpsum;      // # of pathways by each hit gene in pathway
    unsigned int pathcountsum;  // # of pathways for each gene in pathway
    double OR;
    double gpcc_OR;
    double pval;
    double pval2;               // alt
    double permute_pval;        // permute
    double gpcc_p;
    double fdr;
    double gpcc_fdr;
    double enrichment_score;    // ratio
    unsigned int pwgenesindex;
    // original (george): int a,b,c,d;
    unsigned int a,b,c,d;       // a=universe-userinput-pwgenes-list b=pw-hits, c=degs-hits , d = number of hits
    unsigned int A_scaled,B_scaled,C_scaled,D_scaled;
    // was: #if NELSON_C
#if 1
    unsigned int randhits;
    unsigned int countover;     // data hits value > permutation p hits
    unsigned int countequal;
    unsigned int countunder;    // redundant
    double p_permute_over;
    double p_permute_under;     // redundant
#endif
    double pval4;
    double fdr4;
};

struct tree_with_count
{
    unsigned int val;   // entrez gene id : sometimes called "egid"
    unsigned int count; // number of pathways this gene hits
    unsigned int deg;   // 1 on deglist, 0 not on ( deglist = "differentially expressed gene list" , aka user inlist)
    struct tree_with_count *left;
    struct tree_with_count *right;
    struct used_path_type **all_gene_paths; // all gene paths is an array of pointers ( of "count" size).
    unsigned int pathindex;                 // which array member gets the pointer to pathway?
};

struct custom_type
{
    char *name;
    char *optional; // should in practice be the accession ?
    unsigned int numgenes;
    unsigned int *genes;
};

struct ens2gene_type
{
    char *ens;
    char *symbol;
};

struct a2a_type
{
    int taxid1;
    int taxid2;
    int ensidx1;
    int ensidx2;
};

struct synonym_type
{
    char *Synonym;
    int GeneID;
    char *Symbol;
    int status;
};

struct entrez_hugo_ensemble_type
{
    unsigned int gene_id; // note case of value is zero
    char *hugo;
    char *ens;
};

// ---- function prototypes ----

void category_set_all(unsigned int *pat);
void category_code_to_string(unsigned int cat,char puthere[]);
int string_to_category_code(char cats[]);
void categories_pattern_to_strings(unsigned int cat,char puthere[]);
double exact22(int n11_,int n12_,int n21_,int n22_); // fishers exact
double exact22_oneside(int n11_,int n12_,int n21_,int n22_, int dbg);
unsigned int string2type(char *s);
int bitCount(int n);
int setup_by_egids(void);
char *egid2hugo(int egid);
unsigned int hugo2egid(char *h);
char *type2string(int type);
int cmp_ui(const void *a, const void *b);
unsigned int *get_used_universe(struct used_path_type *u, unsigned int num_used, unsigned int *real_universe_cnt);
int cmp_ordertype_by_val_REV(const void *a, const void *b);
int cmp_usi(const void *a, const void *b);
int do_pvals_and_bh(unsigned int ingenecnt, struct used_path_type usedpaths[], unsigned int numusedpaths,unsigned int real_universe_cnt, int oneside);
unsigned int GPCC(struct used_path_type usedpaths[], unsigned int num_used_paths, unsigned int real_universe_cnt, unsigned int *real_universe);
int do_just_bh(unsigned int ingenecnt, struct used_path_type usedpaths[], unsigned int num_used_paths,unsigned int real_universe_cnt);
// void malloc_pathpointers(struct tree_with_count *node); // counts aligned with universe (real_universe)
void radix_ui(register unsigned int vector[], register const unsigned int size) ;
int l2pfunc(struct used_path_type *usedpaths,unsigned int num_used_paths,unsigned int real_universe_cnt, unsigned int *real_universe, int calc_option, int *user_incnt_ptr, int oneside, unsigned int numpermutes);
struct updated_genes_type *updategenesR(char *genes[], const int len);
struct entrez_hugo_ensemble_type *egids2hugos(unsigned int egids[], const int len);
struct used_path_type *setup_used_paths(unsigned int *num_used_paths, unsigned int catspat, char universe_file[], unsigned int in_universe_cnt,unsigned int *in_universe, char custom_file[], unsigned int gmtfld2, unsigned int *real_universe_cnt_ptr,unsigned int **real_universe,unsigned int lencust,struct custom_type *mycustompw);
void bh_adjusted(const double *p, double *pa, int size) ;
double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);

#endif // PATHWORKS_H