Skip to content

Commit

Permalink
Fix dedup output order when unmapped pairs are written to the same file
Browse files (browse the repository at this point in the history)
  • Loading branch information
Phlya committed Apr 9, 2024
1 parent 0c1a0c7 commit 6000a2d
Showing 1 changed file with 12 additions and 12 deletions.
24 changes: 12 additions & 12 deletions pairtools/lib/dedup.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -89,9 +89,19 @@ def streaming_dedup(
# Clean up dataframe:
df_chunk = df_chunk.drop(columns=["duplicate"])

# Stream the pairs:
# If outstream_dups is the same as outstream, we save all mapped pairs to the same file
# Save the pairs:

# Stream unmapped:
if outstream_unmapped:
df_chunk.loc[~mask_mapped, :].to_csv(
outstream_unmapped,
index=False,
header=False,
sep="\t",
quoting=QUOTE_NONE,
)

# If outstream_dups is the same as outstream, we save the mapped pairs to the same file
if outstream_dups == outstream:
df_chunk.loc[mask_mapped, :].to_csv(
outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
Expand All @@ -116,16 +126,6 @@ def streaming_dedup(
outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
)

# Stream unmapped:
if outstream_unmapped:
df_chunk.loc[~mask_mapped, :].to_csv(
outstream_unmapped,
index=False,
header=False,
sep="\t",
quoting=QUOTE_NONE,
)

t1 = time.time()
t = t1 - t0
logger.debug(f"total time: {t}")
Expand Down

0 comments on commit 6000a2d

Please sign in to comment.