Skip to content

Commit

Permalink
GeneSCF source code
Browse files Browse the repository at this point in the history
  • Loading branch information
decodebiology committed Apr 22, 2020
1 parent 8323c27 commit 5fcaa26
Show file tree
Hide file tree
Showing 93 changed files with 98,698 additions and 0 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file added GeneSCF-Documentation_v1.1.pdf
Binary file not shown.
39 changes: 39 additions & 0 deletions Instructions_for_geneSCF_batch.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@


GeneSCF v1.1 patch release - Instructions to run GeneSCF batch jobs
=========================================================================

Files added (patch scripts)
---------------------------

1) geneSCF_batch
2) db_batch_config.txt
3) Instructions_for_geneSCF_batch.txt


What are these patch scripts for?
---------------------------------

These patch files can be used to run GeneSCF for multiple gene lists simultaneously (batch jobs).

Example, if you want to do enrichment analysis for 100 gene list without writing command line for individual submission, these batch files can be used to submit all at once by simply specifying the folder where gene lists are stored and path to output folder.

files_path="/FOLDER/WHERE/GENE_LISTS/STORED"
output_path="/FOLDER/PATH/FOR/OUTPUT"

Note:
- Recommended to keep all input files in same folder.
- Inside specified output folder path GeneSCF will automatically create individual sub-folders for each gene list.


How to use these patch files?
-----------------------------

1) Edit script './geneSCF-master-source-v1.1-px/geneSCF_batch' for your input files (files_path) and output path (output_path).

files_path="/FOLDER/WHERE/GENE_LISTS/STORED"
output_path="/FOLDER/PATH/FOR/OUTPUT"

2) Edit file './geneSCF-master-source-v1.1-px/db_batch_config.txt' to configure your parameters for batch run.

3) Execute [genescf_path]/geneSCF-master-source-v1.1-px/geneSCF_batch.
Binary file added class/.DS_Store
Binary file not shown.
283 changes: 283 additions & 0 deletions class/functional_class3.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
#!/usr/bin/perl

$/=undef;
use Cwd;
$pwd=cwd();
use lib "$ARGV[5]/class/lib";
use lib "$ARGV[5]/class//lib/Tie-IxHash-1.23";
use Tie::IxHash;
use Statistics::Multtest qw(bonferroni holm hommel hochberg BH BY qvalue);
use Statistics::Multtest qw(:all);
tie %kegg, 'Tie::IxHash';
use Text::NSP::Measures::2D::Fisher::right;
use Number::FormatEng qw(:all);
use Data::Dumper;

$GTYPE = $ARGV[0];
$INFILE = $ARGV[1];
$OUTPATH = $ARGV[2];
$database = $ARGV[3];
$background = $ARGV[4];
$DIR = $ARGV[5];
$organism = $ARGV[6];
$plot = $ARGV[7];

$start_run = time();




@myout=split("/",$INFILE);
open(RESULT,">$OUTPATH/@{myout[$#myout]}_${database}_${organism}_functional_classification.tsv");


$genescf_note="Note:Only KEGG and Geneontology supports multiple organisms (GeneSCF-xx/org_codes_help). If you choose REACTOME/NCG database please specify organism as 'Hs'. Currently REACTOME and NCG in GeneSCF only supports Human (Hs).";

#### KEGG DB ####



if($GTYPE eq "gid" && $database eq "KEGG")
{
$mytype="Entrez GeneID";

$kup=`find $DIR/class/lib/db/${organism}/kegg_database.txt -maxdepth 0 -printf "%TY-%Tm-%Td %TH:%TM\n" | head -n1`;
$kup=~ tr/\n//d;

print "$genescf_note";
print "\n$database last updated $kup\n";

print "Example input types\n---------------\ngid |\tsym\n";
$gene_type=`curl -s http://rest.kegg.jp/list/${organism} | grep -v "uncharacterized"| sort -R | head -n10 | sed 's/^${organism}\://' | sed 's/\; /\t/' | cut -f1,2`; ## Edited 2017/07/07
print $gene_type;

$ginput=$INFILE;
open(IN1,"$DIR/class/lib/db/${organism}/kegg_database.txt") or die "Error opening in database file";

}
if($GTYPE eq "sym" && $database eq "KEGG")
{
$mytype="Gene Symbol";

$kup=`find $DIR/class/lib/db/${organism}/kegg_database.txt -maxdepth 0 -printf "%TY-%Tm-%Td %TH:%TM\n" | head -n1`;
$kup=~ tr/\n//d;

print "$genescf_note";
print "\n$database last updated $kup\n";

print "Example input types\n---------------\ngid |\tsym\n";
#$gene_type=`curl -s http://rest.kegg.jp/list/${organism} | grep -v "uncharacterized"| head -n10 | sed 's/^${organism}\://' | sed 's/\;/\t/' | awk '{print \$1"\t"\$2;}'`;
$gene_type=`curl -s http://rest.kegg.jp/list/${organism} | grep -v "uncharacterized"| sort -R | head -n10 | sed 's/^${organism}\://' | sed 's/\; /\t/' | cut -f1,2`; ## Added 2017/01/13 # Edited 2017/07/07
print $gene_type;

print "=> Retreving gene list for ${organism} from $database\n";
#$cmd=`curl -s -S http://rest.kegg.jp/list/${organism} | sed "s/${organism}\://" > $DIR/mapping/DB/@{myout[$#myout]}_gene_list.txt;`; ## Edited 2010/07/24

$cmd=`curl -s -S http://rest.kegg.jp/list/${organism} | sed "s/${organism}\://" | sed 's/; /\t/' | cut -f1,2> $DIR/mapping/DB/@{myout[$#myout]}_gene_list.txt;`; ## Added 2017/06/01 # Edited 2017/07/07

print "=> Mapping user list\n";
$cmd=`perl $DIR/class/scripts/mappingIDS.pl $DIR/mapping/DB/@{myout[$#myout]}_gene_list.txt $INFILE $DIR | awk '!x[\$0]++' > $OUTPATH/@{myout[$#myout]}_user_mapped.list`; ## Edited 2010/07/24
$cmd=(`rm -rf $DIR/mapping/DB/@{myout[$#myout]}_gene_list.txt || true`); ## Edited and addded 2010/07/24
$ulist=`wc -l $INFILE | cut -d' ' -f1`;

$cmd=`cat $OUTPATH/@{myout[$#myout]}_user_mapped.list | cut -f1 | awk '!x[\$0]++' > $DIR/mapping/@{myout[$#myout]}_input_list.txt`;
#$mlist=`wc -l $DIR/mapping/@{myout[$#myout]}_input_list.txt | cut -d' ' -f1`;
$mlist=`cat $OUTPATH/@{myout[$#myout]}_user_mapped.list | cut -f2 | awk '!x[\$0]++' | wc -l`; ## Edited and Added 2017/01/13

$mlist=~ tr/\n//d;

$ulist=~ tr/\n//d;

$plist=($mlist/$ulist)*100;

if($mlist>0)
{
print "Note: Since you used Gene Symbol as identifier, the accuracy will be less, for better results use Entrez UIDs.\n There were " . $mlist . " genes mapped from " . $ulist . " user provided unique genes ($plist %)\n";
$ginput="$DIR/mapping/@{myout[$#myout]}_input_list.txt";
open(IN1,"$DIR/class/lib/db/${organism}/kegg_database.txt") or die "Error opening in database file";
}
if($mlist<1)
{

print "Note: There were " . $mlist . " genes mapped from " . $ulist . " user provided unique genes ($plist %)\nPlease cross-check your gene identifier.";
exit;
}
}




#### GeneOntology, REACTOME, NCG DB ####


if($database eq "GO_BP" || $database eq "GO_CC" || $database eq "GO_MF" || $database eq "GO_all" || $database eq "REACTOME" || $database eq "NCG")
{
if($GTYPE eq "gid")
{
$mytype="Entrez GeneID";
}
if($GTYPE eq "gid")
{
$mytype="Entrez GeneID";
}
if($GTYPE eq "sym")
{
$mytype="Gene Symbol";
}

$goup=`find $DIR/class/lib/db/${organism}/${database}_${GTYPE}.txt -maxdepth 0 -printf "%TY-%Tm-%Td %TH:%TM\n" | head -n1`;
$goup=~ tr/\n//d;

print "$genescf_note";
print "\n$database last updated $goup\n";


$ginput=$INFILE;
open(IN1,"$DIR/class/lib/db/${organism}/${database}_${GTYPE}.txt") or die "Error opening in database file";


}



###### DB end #######



open(IN2,$ginput) or print "\n***\nError opening input file: $INFILE\n***\n\n";


print RESULT "Genes\tProcess~name\tnum_of_Genes\tgene_group\tpercentage%\tP-value\tBenjamini and Hochberg (FDR)\t Hommel singlewise process\tBonferroni single-step process\tHommel singlewise process\tHochberg step-up process\tBenjamini and Yekutieli\n";

while(<IN1>){
chomp;
@temp1=split("\n",$_);
foreach $temp1(@temp1)
{
@ar1 = split("\t",$temp1);
$kegg{$ar1[0]} = ["$ar1[1]"];
}
}



while(<IN2>){
@temp2=split("\n",$_);
foreach $temp2(@temp2)
{
@ar2 = split("\t",$temp2);
$bp{$ar2[0]} = $ar2[0];
}
}
$gene_list=@temp2;

@mykeys=();
foreach my $keykg ( keys %kegg )
{

@kgene=split(",",$kegg{$keykg}[0]);
$kcount=1;
$gcount=1;
$gnum=0;
@gset=();
foreach $kgene(@kgene)
{$knum=$kcount++;
if(exists $bp{$kgene})
{
$gnum=$gcount++;
$indgene=$bp{$kgene}.";";
push(@gset,$indgene);

}


}


if($gnum>0)
{
$x=$gnum;
$n=$gene_list;
$M=$knum; ## total genes in process
$N=$background;




$fisher_value = calculateStatistic( n11=>$x,n1p=>$n,np1=>$x+$M,npp=>$N+$n);


push(@new,$fisher_value);


$percent=(($gnum/$knum)*100);
push(@finres,"@gset\t$keykg\t$gnum\t$knum\t$percent\t$fisher_value\t");

}


}

$p=\@new;
$bhres = BH($p);
$holmres = holm($p);
$bfres=bonferroni($p);
$hommel=hommel($p);
$hochberg=hochberg($p);
$byres=BY($p);


for($i=0;$i<=$#finres;$i++)
{
print RESULT $finres[$i];
print RESULT @$bhres[$i]."\t".@$holmres[$i]."\t".@$bfres[$i]."\t".@$hommel[$i]."\t".@$hochberg[$i]."\t".@$byres[$i]."\n";

}




close(RESULT);
if($plot eq "yes")
{
open(TEMP,">$DIR/mapping/@{myout[$#myout]}_${database}_${organism}_functional_classification_temp.tsv");
open(IN3,"$OUTPATH/@{myout[$#myout]}_${database}_${organism}_functional_classification.tsv");

while(<IN3>){
chomp;
@temp1=split("\n",$_);
foreach $temp1(@temp1)
{
@dat = split("\t",$temp1);


print TEMP "$dat[1]\t$dat[4]\t$dat[5]\n";


}
}

close(TEMP);


$cmd=(`nohup Rscript $DIR/class/scripts/bubble.R $DIR/mapping/@{myout[$#myout]}_${database}_${organism}_functional_classification_temp.tsv $OUTPATH @{myout[$#myout]}_${database}_${organism} $database &`);
$cmd=(`rm -rf $DIR/mapping/@{myout[$#myout]}_${database}_${organism}_functional_classification_temp.tsv $DIR/mapping/@{myout[$#myout]}_input_list.txt || true`); ## Edited 2010/07/24
}
if($plot eq "no")
{
print "No graphical output prefered";
}

if(${organism} eq "Hs" || ${organism} eq "goa_human" || ${organism} eq "hsa")
{
${organism} = "Human/Homo sapiens";
}
print"=================\nRun successful. Check your output directory $OUTPATH \n=================\n\nParameters used:\n\nOrganism:\t\t${organism}\nbackground genes:\t$background\nIdentitiy:\t\t$mytype\nDatabase used:\t\t$database\nOutput file:\t\t$OUTPATH@{myout[$#myout]}_${database}_functional_classification.tsv\n\t\tWARNING: Your output is not sorted with P-val/FDR.\n\n\n---------------------\n\nAuthor: Santhilal Subhash\nsanthilal.subhash\@gu.se\n";

$end_run = time();
$run_time = $end_run - $start_run;
print "GeneSCF elapse time $run_time seconds\n";




Binary file added class/lib/.DS_Store
Binary file not shown.
Loading

0 comments on commit 5fcaa26

Please sign in to comment.