Commit

commit changes
xiezhq committed Feb 19, 2017
1 parent c5f7448 commit be31f6c
Showing 7 changed files with 28 additions and 31 deletions.
Binary file modified __pycache__/constants.cpython-33.pyc
Binary file not shown.
Binary file modified __pycache__/is_analysis.cpython-33.pyc
Binary file not shown.
Binary file modified __pycache__/tools.cpython-33.pyc
Binary file not shown.
14 changes: 8 additions & 6 deletions batch4hmp.py
@@ -12,8 +12,8 @@

# To do IS element prediction before summarizing prediction if PREDICT = 1
# To summarize prediction based on the previous prediction to save time if PREDICT = 0
#PREDICT = 1
PREDICT = 0
PREDICT = 1
#PREDICT = 0

python3 = '/usr/local/bin/python3'
#cmd = '/u/zhiqxie/xie/is/isfinder/isPredict.py'
@@ -48,8 +48,10 @@ def batch(args):
dir2proteome4org = '/home/data/insertion_sequence/output4FragGeneScan1.19_illumina_5'
dir2hmmsearchResults = '/home/data/insertion_sequence/output4hmmsearch_illumina_5_cdhit30'
'''
dir2proteome4org = '/data2/zhiqxie/insertion_sequence/output4FragGeneScan1.19_illumina_5'
dir2hmmsearchResults = '/data2/zhiqxie/insertion_sequence/output4hmmsearch_illumina_5_cdhit30'
#dir2proteome4org = '/data2/zhiqxie/insertion_sequence/output4FragGeneScan1.19_illumina_5'
#dir2hmmsearchResults = '/data2/zhiqxie/insertion_sequence/output4hmmsearch_illumina_5_cdhit30'
dir2proteome4org = 'proteome'
dir2hmmsearchResults = 'hmm'
cmdargs = [python3, cmd, '', dir2proteome4org, dir2hmmsearchResults]
# summarize IS elements in each genome DNA and each organism
for org in file4orgs.keys():
@@ -58,9 +60,9 @@ def batch(args):
cmdargs[2] = file
cmdline = ' '.join(cmdargs)
callcmd = shlex.split(cmdline)
#subprocess.check_call(callcmd, shell=False, universal_newlines=False, stdout=subprocess.DEVNULL)
subprocess.check_call(callcmd, shell=False, universal_newlines=False)
#subprocess.check_call(callcmd, shell=False, universal_newlines=False)
subprocess.check_output(callcmd, shell=False, universal_newlines=False, stderr=subprocess.STDOUT)
#subprocess.check_output(callcmd, shell=False, universal_newlines=False, stderr=subprocess.STDOUT)
print(org, file, 'was processed')

# get summarization of IS elements for each organism and write summarization
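Notes on the batch4hmp.py hunks above: the commit flips PREDICT back to 1 (run the IS element prediction before summarizing), points dir2proteome4org and dir2hmmsearchResults at the relative directories 'proteome' and 'hmm' instead of absolute /data2 paths, and replaces subprocess.check_call with subprocess.check_output. check_call lets the child's output flow to the parent's streams, while check_output captures stdout and, with stderr=subprocess.STDOUT, merges stderr into the captured stream. A minimal sketch of the difference, using a placeholder command rather than the real isPredict.py invocation:

    import shlex
    import subprocess

    # Placeholder command; the real script assembles it from python3, the
    # isPredict.py path and the organism/proteome/hmm arguments, then splits
    # it with shlex.split().
    callcmd = shlex.split('echo hello')

    # check_call: raises CalledProcessError on a non-zero exit code; the
    # child's stdout/stderr simply go to the parent's console.
    subprocess.check_call(callcmd, shell=False, universal_newlines=False)

    # check_output: also raises on a non-zero exit code, but returns the
    # child's stdout as bytes (universal_newlines=False); stderr=STDOUT folds
    # error messages into that captured output instead of printing them.
    out = subprocess.check_output(callcmd, shell=False, universal_newlines=False,
                                  stderr=subprocess.STDOUT)
    print(out)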
6 changes: 4 additions & 2 deletions pred.py
@@ -1784,7 +1784,7 @@ def pred(args):
tblout_list = prepare4tblout_list(hmm_path, fileids)
if len(tblout_list) == 0:
print('No results returned by HMM search was found for sequences in', args['dna_list'])
return 0
return None

#print('Processing tblout files at', datetime.datetime.now().ctime())
mtblout_hits_sorted = []
@@ -1907,7 +1907,7 @@ def pred(args):
if hits_sorted == None or len(hits_sorted) == 0:
e = 'No hit was found for {} {}'.format(seqid, seqid_hits)
print(e)
return
return 0

hits_sorted_refined = refine_hmm_hits_evalue(hits_sorted, e_value)
if len(hits_sorted_refined) == 0:
@@ -1916,6 +1916,8 @@
continue
mtblout_hits_sorted_refined.append((seqid, hits_sorted_refined))
#print('Finish refining hits for each DNA sequence', datetime.datetime.now().ctime())
if len(mtblout_hits_sorted_refined) == 0:
return None

mtblout_hits_sorted = mtblout_hits_sorted_refined

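The pred.py hunks swap the sentinel values of two early returns (None when the HMM search produced no tblout files at all, 0 when a single sequence has no hits) and add a new guard that returns None when e-value refinement removes every remaining hit. A sketch of that guard, with a hypothetical wrapper name standing in for the surrounding loop in pred():

    def refine_all(mtblout_hits_sorted, e_value, refine):
        # Hypothetical wrapper around the per-sequence refinement loop shown
        # in the diff: keep only sequences that still have hits after
        # e-value filtering (refine would be refine_hmm_hits_evalue).
        refined = []
        for seqid, hits_sorted in mtblout_hits_sorted:
            hits = refine(hits_sorted, e_value)
            if len(hits) == 0:
                continue
            refined.append((seqid, hits))
        # Guard added by the commit: nothing survived refinement, so bail out
        # instead of carrying an empty hit list forward.
        if len(refined) == 0:
            return None
        return refined

A caller can then distinguish "no refined hits" (None) from a normal, non-empty result before writing any summary files.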
2 changes: 1 addition & 1 deletion readme
@@ -69,7 +69,7 @@ Version 1.1.1
2.3.2.3 Save and close constants.py.

2.4 Let's try an example, NC_012624.fna.
The command below scans NC_012624.fna (a genome sequence from genome Sulfolobus_islandicus_Y_N_15_51), and outputs all results in prediction directory:
The command below scans NC_012624.fna (the genome sequence from Sulfolobus_islandicus_Y_N_15_51), and outputs all results in prediction directory:

python3 isescan.py NC_012624.fna proteome hmm

37 changes: 15 additions & 22 deletions tools.py
@@ -49,31 +49,24 @@ def write2file(filePath = None, content = None):
with open(filePath, 'w') as fp:
fp.write(content)

# To split a huge file containing many fasta sequences into many individual fasta files
# with the file names as NC_xxxxx.x which must already exist in each >gi line in original
# huge fasta file.
# Split a multi-sequence fasta file into individual fasta files, one per sequence,
# with the sequence id included in each output file name.
def split_tandem_fasta(huge_fasta_file, output_path):
fp = open(huge_fasta_file, "r")
mfastaFileName = os.path.basename(huge_fasta_file)
fp_fasta = open("/dev/null", "r")
for line in fp:
if line.isspace():
continue
if line[0] == '>':
fp_fasta.close()
if "|NC_" in line or ">NC_" in line:
nc_start = line.find("NC_")
nc_end = line.find('|', nc_start+3)
if nc_end == -1:
nc_end = line.find(' ', nc_start+3)
fasta_file_name = line[nc_start:nc_end]
else:
fasta_file_name = line[1:-1]
fasta_file_name = output_path + '/' + fasta_file_name
fasta_file_name += ".fna"
fp_fasta= open(fasta_file_name, "w")
fp_fasta.write(line)
with open(huge_fasta_file, "r") as fp:
for line in fp:
line = line.strip()
if len(line) == 0:
continue
if line[0] == '>':
fp_fasta.close()
seqid = line[1:].split(maxsplit=1)[0]
fasta_file_name = '.'.join([mfastaFileName, seqid])
fasta_file = os.path.join(output_path, fasta_file_name)
fp_fasta= open(fasta_file, "w")
fp_fasta.write(line+'\n')
fp_fasta.close()
fp.close()


# This function returns a generator, using a generator comprehension. The generator returns the string sliced,
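The rewritten split_tandem_fasta in tools.py drops the NC_-specific header parsing and instead names each output file <input-file-basename>.<seqid>, where seqid is the first whitespace-separated token after '>' in the header line. A self-contained sketch of that behaviour (not the repository's exact code: it keeps the output handle in a local variable instead of opening /dev/null as a placeholder, and it skips any lines that appear before the first header):

    import os

    def split_multifasta(multi_fasta_file, output_path):
        # One output file per sequence, named <basename>.<seqid>.
        base = os.path.basename(multi_fasta_file)
        out = None
        with open(multi_fasta_file) as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                if line.startswith('>'):
                    if out is not None:
                        out.close()
                    seqid = line[1:].split(maxsplit=1)[0]
                    out = open(os.path.join(output_path, '.'.join([base, seqid])), 'w')
                if out is not None:
                    out.write(line + '\n')
        if out is not None:
            out.close()

Called as split_multifasta('NC_012624.fna', 'prediction') (hypothetical paths, matching the readme example), this would write prediction/NC_012624.fna.<seqid> for each sequence in the input.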
