Skip to content

Commit

Permalink
Merge pull request #51 from eseiler/misc/util_scripts
Browse files Browse the repository at this point in the history
[MISC] Add and fix util scripts
  • Loading branch information
eseiler authored Aug 13, 2021
2 parents 10ec49b + 609ecbf commit 9fb8115
Show file tree
Hide file tree
Showing 5 changed files with 256 additions and 104 deletions.
1 change: 1 addition & 0 deletions util/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ bin/
$ tree src/bash_scripts/
src/bash_scripts/
├── benchmark.sh # Runs benchmarks for Raptor
├── benchmark_wk.sh # Runs different w/k combinations for Raptor
├── count_minimisers.sh # Counts a variety of different minimizers for a data set
├── dream_yara.sh # Runs benchmarks for DREAM-Yara
├── original_dream_yara.sh # Runs benchmarks for the original DREAM-Yara
Expand Down
120 changes: 68 additions & 52 deletions util/src/bash_scripts/benchmark.sh
Original file line number Diff line number Diff line change
@@ -1,73 +1,89 @@
#!/bin/bash
set -e

READ_LENGTH=250
READ_LENGTH=100
W=23
K=19
ERRORS=2
HASH=2
SIZE="4096m"
SIZES="1g 2g 4g"
THREADS=4
BIN_NUMBER=1024
BINARY_DIR="<path to built binaries>" # containing the raptor binary
BIN_DIR="<bin path>" # output directory of simulation. the directory that contains the BIN_NUMBER directory
BENCHMARK_DIR="<path>" # directory where the benchmarks should be run. Input data will be copied here. e.g. /dev/shm/username; BIN_NUMBER directory will be created.
INPUT_DIR="<bin path>" # output directory of simulation. the directory that contains the BIN_NUMBER directory
BENCHMARK_DIR="<path>" # directory where results should be stored. E.g., /dev/shm/username; BIN_NUMBER directory will be created.
COPY_INPUT=false # If true, input data will be copied from INPUT_DIR to BENCHMARK_DIR.
EVAL_ENERGY=true # If true, use perf to measure power/energy-pkg/ and power/energy-ram/.

working_directory=$BENCHMARK_DIR/$BIN_NUMBER
mkdir -p $working_directory/bins/
mkdir -p $working_directory/reads/
mkdir -p $working_directory

for i in $(seq -f "$BIN_DIR/$BIN_NUMBER/bins/bin_%0${#BIN_NUMBER}g.fasta" 0 1 $((BIN_NUMBER-1)))
do
cp $i $working_directory/bins/
done
seq -f "$working_directory/bins/bin_%0${#BIN_NUMBER}g.fasta" 0 1 $((BIN_NUMBER-1)) > $working_directory/bins.list
if [ "$COPY_INPUT" = true ] ; then
echo -n "Copying input..."
mkdir -p $working_directory/bins/
mkdir -p $working_directory/reads/

cp $BIN_DIR/$BIN_NUMBER/reads_e$ERRORS\_$READ_LENGTH/all.fastq $working_directory/reads/
for i in $(seq -f "$INPUT_DIR/$BIN_NUMBER/bins/bin_%0${#BIN_NUMBER}g.fasta" 0 1 $((BIN_NUMBER-1)))
do
cp $i $working_directory/bins/
done
seq -f "$working_directory/bins/bin_%0${#BIN_NUMBER}g.fasta" 0 1 $((BIN_NUMBER-1)) > $working_directory/bins.list

do_task () {
ibf_filename=$working_directory/$w\_$k\_$SIZE.ibf # Does not contain HASH
build_log=$working_directory/$w\_$k\_$SIZE\_build.log
echo "Building IBF with ($w, $k)-minimisers with $HASH hashes and of size $SIZE"
/usr/bin/time -o $build_log -v \
$BINARY_DIR/raptor build \
--output $ibf_filename \
--kmer $k \
--window $w \
--size $SIZE \
--threads $THREADS \
--hash $HASH \
$working_directory/bins.list
cp $INPUT_DIR/$BIN_NUMBER/reads_e$ERRORS\_$READ_LENGTH/all.fastq $working_directory/reads/
read_file=$working_directory/reads/all.fastq
echo "Done."
else
seq -f "$INPUT_DIR/$BIN_NUMBER/bins/bin_%0${#BIN_NUMBER}g.fasta" 0 1 $((BIN_NUMBER-1)) > $working_directory/bins.list
read_file=$INPUT_DIR/$BIN_NUMBER/reads_e$ERRORS\_$READ_LENGTH/all.fastq
fi

query_log=$working_directory/$w\_$k\_$SIZE\_query.log # Does not contain HASH
query_out=$working_directory/$w\_$k\_$SIZE.out
echo "Searching IBF for reads of length $READ_LENGTH containing $ERRORS errors"
/usr/bin/time -o $query_log -v \
$BINARY_DIR/raptor search \
--query $working_directory/reads/all.fastq \
--index $ibf_filename \
--output $query_out \
--threads $THREADS \
--error $ERRORS \
--pattern $READ_LENGTH \
--tau 0.9999 \
--time
#######################################
# Runs the given build command, optionally wrapped in `perf stat` so that the
# power/energy-pkg/ and power/energy-ram/ events are recorded.
# Globals:   EVAL_ENERGY (read) - "true" enables the perf wrapper
#            build_perf  (read) - file that receives the perf report
# Arguments: "$@" - the command to run, including all of its arguments
# Returns:   exit status of perf (when wrapped) or of the command itself
#######################################
launch_build() {
  if [[ "$EVAL_ENERGY" == true ]]; then
    # Quote the report path: an unquoted path containing whitespace would be
    # split into several perf arguments.
    perf stat -o "$build_perf" -e power/energy-pkg/,power/energy-ram/ "$@"
  else
    "$@"
  fi
}

rm $ibf_filename
#######################################
# Runs the given query command, optionally wrapped in `perf stat` so that the
# power/energy-pkg/ and power/energy-ram/ events are recorded.
# Globals:   EVAL_ENERGY (read) - "true" enables the perf wrapper
#            query_perf  (read) - file that receives the perf report
# Arguments: "$@" - the command to run, including all of its arguments
# Returns:   exit status of perf (when wrapped) or of the command itself
#######################################
launch_query() {
  if [[ "$EVAL_ENERGY" == true ]]; then
    # Quote the report path: an unquoted path containing whitespace would be
    # split into several perf arguments.
    perf stat -o "$query_perf" -e power/energy-pkg/,power/energy-ram/ "$@"
  else
    "$@"
  fi
}

pidlist=""
for size in $SIZES; do
ibf_filename=$working_directory/$W\_$K\_$size.ibf # Does not contain HASH
build_log=$working_directory/$W\_$K\_$size\_build.log
build_perf=$working_directory/$W\_$K\_$size\_build.perf
echo "Building IBF with ($W, $K)-minimisers with $HASH hashes and of size $size"
launch_build /usr/bin/time -o $build_log -v \
$BINARY_DIR/raptor build \
--output $ibf_filename \
--kmer $K \
--window $W \
--size $size \
--threads $THREADS \
--hash $HASH \
$working_directory/bins.list

for w in $(seq 23 2 32 && seq 32 2 80)
do
for k in 16 17 18 19 20
do
if [[ $w != 23 ]] || [[ $k != 20 ]]; then # Segfault for READ_LENGTH = 150
do_task & pidlist="$pidlist $!"
fi
done
for job in $pidlist
do
wait $job
done
query_log=$working_directory/$W\_$K\_$size\_query.log # Does not contain HASH
query_perf=$working_directory/$W\_$K\_$size\_query.perf
query_out=$working_directory/$W\_$K\_$size.out
echo "Searching IBF for reads of length $READ_LENGTH containing $ERRORS errors"
launch_query /usr/bin/time -o $query_log -v \
$BINARY_DIR/raptor search \
--query $read_file \
--index $ibf_filename \
--output $query_out \
--threads $THREADS \
--error $ERRORS \
--pattern $READ_LENGTH \
--tau 0.9999 \
--time

rm $ibf_filename
done

# Uncomment for basic cleanup, does not delete results
Expand Down
91 changes: 91 additions & 0 deletions util/src/bash_scripts/benchmark_wk.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/bin/bash
# Runs Raptor build/search benchmarks for different (w, k) combinations.
# Adjust the configuration below before running; values written as <...> are
# placeholders that must be replaced with real paths.
set -e

READ_LENGTH=250
ERRORS=2
HASH=2
SIZE="4096m"
THREADS=4
BIN_NUMBER=1024
BINARY_DIR="<path to built binaries>" # containing the raptor binary
INPUT_DIR="<bin path>" # output directory of simulation. the directory that contains the BIN_NUMBER directory
BENCHMARK_DIR="<path>" # directory where results should be stored. E.g., /dev/shm/username; BIN_NUMBER directory will be created.
COPY_INPUT=false # If true, input data will be copied from INPUT_DIR to BENCHMARK_DIR.

working_directory="$BENCHMARK_DIR/$BIN_NUMBER"
mkdir -p "$working_directory"

# Bin files are numbered 0..BIN_NUMBER-1, zero-padded to the number of digits
# of BIN_NUMBER (e.g. bin_0000.fasta .. bin_1023.fasta for BIN_NUMBER=1024).
# NOTE: the directory is embedded in the seq format string below, so it must
# not contain a '%' character.
bin_format="bin_%0${#BIN_NUMBER}g.fasta"
last_bin=$((BIN_NUMBER - 1))
reads_subdir="$BIN_NUMBER/reads_e${ERRORS}_${READ_LENGTH}"

if [[ "$COPY_INPUT" == true ]]; then
  echo -n "Copying input..."
  mkdir -p "$working_directory/bins/"
  mkdir -p "$working_directory/reads/"

  # Read the generated paths line by line instead of word-splitting a command
  # substitution, so that paths containing whitespace stay intact.
  while IFS= read -r bin_file; do
    cp -- "$bin_file" "$working_directory/bins/"
  done < <(seq -f "$INPUT_DIR/$BIN_NUMBER/bins/$bin_format" 0 1 "$last_bin")
  # The list handed to `raptor build` points at the copied bins.
  seq -f "$working_directory/bins/$bin_format" 0 1 "$last_bin" > "$working_directory/bins.list"

  cp -- "$INPUT_DIR/$reads_subdir/all.fastq" "$working_directory/reads/"
  read_file="$working_directory/reads/all.fastq"
  echo "Done."
else
  # No copy: list the bins and reference the reads in their original location.
  seq -f "$INPUT_DIR/$BIN_NUMBER/bins/$bin_format" 0 1 "$last_bin" > "$working_directory/bins.list"
  read_file="$INPUT_DIR/$reads_subdir/all.fastq"
fi

#######################################
# Builds an IBF for the current (w, k) pair, searches it with the read file,
# and deletes the IBF afterwards. Both steps run under `/usr/bin/time -v`,
# whose report is written to a per-run log file.
# Globals (read): w, k, SIZE, HASH, THREADS, ERRORS, READ_LENGTH,
#                 BINARY_DIR, working_directory, read_file
# Arguments: none
# Outputs:   progress messages on stdout; <w>_<k>_<SIZE>_build.log,
#            <w>_<k>_<SIZE>_query.log and <w>_<k>_<SIZE>.out in
#            working_directory
#######################################
do_task () {
  # All artefacts of one run share this prefix. Does not contain HASH.
  local run_prefix="$working_directory/${w}_${k}_${SIZE}"
  local ibf_filename="$run_prefix.ibf"
  local build_log="${run_prefix}_build.log"
  local query_log="${run_prefix}_query.log"
  local query_out="$run_prefix.out"

  echo "Building IBF with ($w, $k)-minimisers with $HASH hashes and of size $SIZE"
  /usr/bin/time -o "$build_log" -v \
    "$BINARY_DIR/raptor" build \
      --output "$ibf_filename" \
      --kmer "$k" \
      --window "$w" \
      --size "$SIZE" \
      --threads "$THREADS" \
      --hash "$HASH" \
      "$working_directory/bins.list"

  echo "Searching IBF for reads of length $READ_LENGTH containing $ERRORS errors"
  /usr/bin/time -o "$query_log" -v \
    "$BINARY_DIR/raptor" search \
      --query "$read_file" \
      --index "$ibf_filename" \
      --output "$query_out" \
      --threads "$THREADS" \
      --error "$ERRORS" \
      --pattern "$READ_LENGTH" \
      --tau 0.9999 \
      --time

  # The index is only needed for the search above; remove it right away.
  rm -- "$ibf_filename"
}

# Sweep over all (w, k) combinations.
# For every window size w, the runs for all k are started in the background
# and awaited before moving on to the next w: at most 5 jobs in parallel.
for w in $(seq 23 2 32 && seq 32 2 80); do # w=23,25,27,29,31,32,34,36,38,...,80
  # Start each round with an empty PID list so that only the jobs launched for
  # this w are awaited. PIDs from earlier rounds were already reaped and must
  # not be waited on again.
  pidlist=()
  for k in 16 17 18 19 20; do
    if [[ $w != 23 ]] || [[ $k != 20 ]]; then # Segfault for READ_LENGTH = 150
      do_task &
      pidlist+=("$!")
    fi
  done
  for job in "${pidlist[@]}"; do
    wait "$job"
  done
done

# Uncomment for basic cleanup, does not delete results
# chmod -R 744 $working_directory/bins
# chmod -R 744 $working_directory/reads
# rm -f $working_directory/bins/*.fasta
# rm -d $working_directory/bins
# rm -f $working_directory/reads/all.fastq
# rm -d $working_directory/reads
4 changes: 2 additions & 2 deletions util/src/bash_scripts/simulate.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env bash
set -e

BINARY_DIR="<path to built binaries>"
OUT_DIR="<output path>"
BINARY_DIR="<path to built binaries>" # Dir containing "mason_genome", "split_sequence", etc.
OUT_DIR="<output path>" # Where simulated data should be stored
LENGTH=4294967296 # 4*2^30 = 4GiB
SEED=42 # was 20181406 before, but was hardcoded to 42 in seqan
BIN_NUMBER=1024
Expand Down
Loading

0 comments on commit 9fb8115

Please sign in to comment.