Commit

commit changes
xiezhq committed Feb 19, 2017
1 parent c5f7448 commit be31f6c
Showing 7 changed files with 28 additions and 31 deletions.
Binary file modified __pycache__/constants.cpython-33.pyc
Binary file not shown.
Binary file modified __pycache__/is_analysis.cpython-33.pyc
Binary file not shown.
Binary file modified __pycache__/tools.cpython-33.pyc
Binary file not shown.
14 changes: 8 additions & 6 deletions batch4hmp.py
@@ -12,8 +12,8 @@

# To do IS element prediction before summarizing prediction if PREDICT = 1
# To summarize prediction based on the previous prediction to save time if PREDICT = 0
#PREDICT = 1
PREDICT = 0
PREDICT = 1
#PREDICT = 0

python3 = '/usr/local/bin/python3'
#cmd = '/u/zhiqxie/xie/is/isfinder/isPredict.py'
@@ -48,8 +48,10 @@ def batch(args):
dir2proteome4org = '/home/data/insertion_sequence/output4FragGeneScan1.19_illumina_5'
dir2hmmsearchResults = '/home/data/insertion_sequence/output4hmmsearch_illumina_5_cdhit30'
'''
dir2proteome4org = '/data2/zhiqxie/insertion_sequence/output4FragGeneScan1.19_illumina_5'
dir2hmmsearchResults = '/data2/zhiqxie/insertion_sequence/output4hmmsearch_illumina_5_cdhit30'
#dir2proteome4org = '/data2/zhiqxie/insertion_sequence/output4FragGeneScan1.19_illumina_5'
#dir2hmmsearchResults = '/data2/zhiqxie/insertion_sequence/output4hmmsearch_illumina_5_cdhit30'
dir2proteome4org = 'proteome'
dir2hmmsearchResults = 'hmm'
cmdargs = [python3, cmd, '', dir2proteome4org, dir2hmmsearchResults]
# summarize IS elements in each genome DNA and each organism
for org in file4orgs.keys():
@@ -58,9 +60,9 @@ def batch(args):
cmdargs[2] = file
cmdline = ' '.join(cmdargs)
callcmd = shlex.split(cmdline)
#subprocess.check_call(callcmd, shell=False, universal_newlines=False, stdout=subprocess.DEVNULL)
subprocess.check_call(callcmd, shell=False, universal_newlines=False)
#subprocess.check_call(callcmd, shell=False, universal_newlines=False)
subprocess.check_output(callcmd, shell=False, universal_newlines=False, stderr=subprocess.STDOUT)
#subprocess.check_output(callcmd, shell=False, universal_newlines=False, stderr=subprocess.STDOUT)
print(org, file, 'was processed')

# get summarization of IS elements for each organism and write summarization
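Notes on the batch4hmp.py hunks above: the commit flips PREDICT back to 1 (run the IS element prediction before summarizing), points dir2proteome4org and dir2hmmsearchResults at the relative directories 'proteome' and 'hmm' instead of absolute /data2 paths, and replaces subprocess.check_call with subprocess.check_output. check_call lets the child's output flow to the parent's streams, while check_output captures stdout and, with stderr=subprocess.STDOUT, merges stderr into the captured stream. A minimal sketch of the difference, using a placeholder command rather than the real isPredict.py invocation:

    import shlex
    import subprocess

    # Placeholder command; the real script assembles it from python3, the
    # isPredict.py path and the organism/proteome/hmm arguments, then splits
    # it with shlex.split().
    callcmd = shlex.split('echo hello')

    # check_call: raises CalledProcessError on a non-zero exit code; the
    # child's stdout/stderr simply go to the parent's console.
    subprocess.check_call(callcmd, shell=False, universal_newlines=False)

    # check_output: also raises on a non-zero exit code, but returns the
    # child's stdout as bytes (universal_newlines=False); stderr=STDOUT folds
    # error messages into that captured output instead of printing them.
    out = subprocess.check_output(callcmd, shell=False, universal_newlines=False,
                                  stderr=subprocess.STDOUT)
    print(out)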
6 changes: 4 additions & 2 deletions pred.py
@@ -1784,7 +1784,7 @@ def pred(args):
tblout_list = prepare4tblout_list(hmm_path, fileids)
if len(tblout_list) == 0:
print('No results returned by HMM search was found for sequences in', args['dna_list'])
return 0
return None

#print('Processing tblout files at', datetime.datetime.now().ctime())
mtblout_hits_sorted = []
@@ -1907,7 +1907,7 @@ def pred(args):
if hits_sorted == None or len(hits_sorted) == 0:
e = 'No hit was found for {} {}'.format(seqid, seqid_hits)
print(e)
return
return 0

hits_sorted_refined = refine_hmm_hits_evalue(hits_sorted, e_value)
if len(hits_sorted_refined) == 0:
@@ -1916,6 +1916,8 @@
continue
mtblout_hits_sorted_refined.append((seqid, hits_sorted_refined))
#print('Finish refining hits for each DNA sequence', datetime.datetime.now().ctime())
if len(mtblout_hits_sorted_refined) == 0:
return None

mtblout_hits_sorted = mtblout_hits_sorted_refined

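The pred.py hunks swap the sentinel values of two early returns (None when the HMM search produced no tblout files at all, 0 when a single sequence has no hits) and add a new guard that returns None when e-value refinement removes every remaining hit. A sketch of that guard, with a hypothetical wrapper name standing in for the surrounding loop in pred():

    def refine_all(mtblout_hits_sorted, e_value, refine):
        # Hypothetical wrapper around the per-sequence refinement loop shown
        # in the diff: keep only sequences that still have hits after
        # e-value filtering (refine would be refine_hmm_hits_evalue).
        refined = []
        for seqid, hits_sorted in mtblout_hits_sorted:
            hits = refine(hits_sorted, e_value)
            if len(hits) == 0:
                continue
            refined.append((seqid, hits))
        # Guard added by the commit: nothing survived refinement, so bail out
        # instead of carrying an empty hit list forward.
        if len(refined) == 0:
            return None
        return refined

A caller can then distinguish "no refined hits" (None) from a normal, non-empty result before writing any summary files.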
2 changes: 1 addition & 1 deletion readme
@@ -69,7 +69,7 @@ Version 1.1.1
2.3.2.3 Save and close constants.py.

2.4 Let's try an example, NC_012624.fna.
The command below scans NC_012624.fna (a genome sequence from genome Sulfolobus_islandicus_Y_N_15_51), and outputs all results in prediction directory:
The command below scans NC_012624.fna (the genome sequence from Sulfolobus_islandicus_Y_N_15_51), and outputs all results in prediction directory:

python3 isescan.py NC_012624.fna proteome hmm

37 changes: 15 additions & 22 deletions tools.py
@@ -49,31 +49,24 @@ def write2file(filePath = None, content = None):
with open(filePath, 'w') as fp:
fp.write(content)

# To split a huge file containing many fasta sequences into many individual fasta files
# with the file names as NC_xxxxx.x which must already exist in each >gi line in original
# huge fasta file.
# Split a multi-sequence fasta file into individual fasta files, one per sequence,
# with the sequence id included in each output file name.
def split_tandem_fasta(huge_fasta_file, output_path):
fp = open(huge_fasta_file, "r")
mfastaFileName = os.path.basename(huge_fasta_file)
fp_fasta = open("/dev/null", "r")
for line in fp:
if line.isspace():
continue
if line[0] == '>':
fp_fasta.close()
if "|NC_" in line or ">NC_" in line:
nc_start = line.find("NC_")
nc_end = line.find('|', nc_start+3)
if nc_end == -1:
nc_end = line.find(' ', nc_start+3)
fasta_file_name = line[nc_start:nc_end]
else:
fasta_file_name = line[1:-1]
fasta_file_name = output_path + '/' + fasta_file_name
fasta_file_name += ".fna"
fp_fasta= open(fasta_file_name, "w")
fp_fasta.write(line)
with open(huge_fasta_file, "r") as fp:
for line in fp:
line = line.strip()
if len(line) == 0:
continue
if line[0] == '>':
fp_fasta.close()
seqid = line[1:].split(maxsplit=1)[0]
fasta_file_name = '.'.join([mfastaFileName, seqid])
fasta_file = os.path.join(output_path, fasta_file_name)
fp_fasta= open(fasta_file, "w")
fp_fasta.write(line+'\n')
fp_fasta.close()
fp.close()


# This function returns a generator, using a generator comprehension. The generator returns the string sliced,
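The rewritten split_tandem_fasta in tools.py drops the NC_-specific header parsing and instead names each output file <input-file-basename>.<seqid>, where seqid is the first whitespace-separated token after '>' in the header line. A self-contained sketch of that behaviour (not the repository's exact code: it keeps the output handle in a local variable instead of opening /dev/null as a placeholder, and it skips any lines that appear before the first header):

    import os

    def split_multifasta(multi_fasta_file, output_path):
        # One output file per sequence, named <basename>.<seqid>.
        base = os.path.basename(multi_fasta_file)
        out = None
        with open(multi_fasta_file) as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                if line.startswith('>'):
                    if out is not None:
                        out.close()
                    seqid = line[1:].split(maxsplit=1)[0]
                    out = open(os.path.join(output_path, '.'.join([base, seqid])), 'w')
                if out is not None:
                    out.write(line + '\n')
        if out is not None:
            out.close()

Called as split_multifasta('NC_012624.fna', 'prediction') (hypothetical paths, matching the readme example), this would write prediction/NC_012624.fna.<seqid> for each sequence in the input.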
