Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

self - delimiting codes #193

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions benchmark/self_delimiting_codes/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
include ../../Make.helper
#directories and libraries used by the rules below
SRC_DIR = src
BIN_DIR = bin
LIBS = -lsdsl
#output files of the benchmark
#(comments are kept on their own lines on purpose: make keeps the blanks
# in front of an inline '#' as part of the variable value)
#result file of benchmark
RES_FILE = results/result.csv
#vector assignment table (vector name -> sdsl type)
VAT_FILE = results/vat.csv
#test case table (contains only test case names)
TC_FILE = results/tc.csv

#utility
#helper variables for $(subst): `space` expands to a single blank and
#`comma` to a literal comma, so both can be passed as function arguments
empty:=
space:= $(empty) $(empty)
comma:= ,

#load test cases
#(config_ids / config_select are not defined in this file; presumably they
# are provided by the included Make.helper -- TODO confirm)
#TC_IDS:   result of config_ids on test_case.config (presumably one ID per test case)
#TC_SRC:   column 2 of every test case (used as input path by the rules below)
#TC_FILES: the file the benchmark reads for each test case: the generated
#          ../tmp/BWT_MTF.<id> when column 6 contains BWT_MTF, otherwise
#          column 2 unchanged
TC_IDS := $(call config_ids,test_case.config)
TC_SRC := $(foreach TC_ID,$(TC_IDS),\
$(call config_select,test_case.config,$(TC_ID),2))
TC_FILES := $(foreach TC_ID,$(TC_IDS),\
$(if $(findstring BWT_MTF,$(call config_select,test_case.config,$(TC_ID),6)),\
../tmp/BWT_MTF.$(TC_ID),\
$(call config_select,test_case.config,$(TC_ID),2)))

#neither target produces a file of its own name
.PHONY: all timing

#default target: run the benchmark and produce the raw result file
all: $(RES_FILE)

#run the benchmark, then build the report inside visualize/
#($(MAKE) keeps flags and the jobserver of the parent make; '&&' prevents
# the sub-make from running in this directory if the cd fails)
timing: $(RES_FILE)
	@cd visualize && $(MAKE)

#compilation of bwt - mtf - transform algorithm
#builds $(BIN_DIR)/gen_bwt_mtf from $(SRC_DIR)/gen_bwt_mtf.cpp and links it
#against $(LIBS) plus both divsufsort libraries
#NOTE(review): within this file C_OPTIONS is only assigned by an $(eval)
#inside the sdcbenchmark recipe, so it may still be empty when this rule
#runs first -- confirm whether the generator is meant to be built with the
#options from compile_options.config
$(BIN_DIR)/gen_bwt_mtf: $(SRC_DIR)/gen_bwt_mtf.cpp
@$(MY_CXX) $(MY_CXX_FLAGS) $(C_OPTIONS) -L$(LIB_DIR) "$(SRC_DIR)/gen_bwt_mtf.cpp"\
-I$(INC_DIR) -o "$(BIN_DIR)/gen_bwt_mtf" $(LIBS) -ldivsufsort -ldivsufsort64

#generation of MTF of BWT
#pattern rule: the stem (%) is used as the test case ID. The recipe looks up
#column 2 (input path) and column 5 (num_byte) of that test case in
#test_case.config and calls gen_bwt_mtf with: input path, output file
#../tmp/BWT_MTF.<id>, temporary directory ../tmp and num_byte.
#Note: every generated file lists all of $(TC_SRC) as prerequisites, not
#only its own input file.
../tmp/BWT_MTF.%: $(TC_SRC) $(BIN_DIR)/gen_bwt_mtf
$(eval TC_ID:=$*)
$(eval TC_PATH:=$(call config_select,test_case.config,$(TC_ID),2))
$(eval NUM_BYTE:=$(call config_select,test_case.config,$(TC_ID),5))
@$(BIN_DIR)/gen_bwt_mtf $(TC_PATH) ../tmp/BWT_MTF.$(TC_ID) ../tmp $(NUM_BYTE)

#compilation and creation of vector assignment table
#VTYPES:    column 2 of vectors.config, joined with commas (passed as -DVTYPES)
#VNAMES:    column 3 of vectors.config, turned into a brace-enclosed list of
#           quoted names with shell-escaped quotes (passed as -DVNAMES)
#C_OPTIONS: result of config_ids on compile_options.config (presumably the
#           option string stored there -- TODO confirm against Make.helper)
#After compiling, $(VAT_FILE) is written: a "vector;sdsltype" header
#followed by one "<column 3>;<column 2>" entry per vector.
#NOTE(review): the entries are joined with a literal "\n", so separate
#lines depend on echo expanding backslash escapes -- confirm for the shell in use
$(BIN_DIR)/sdcbenchmark: $(SRC_DIR)/sdc_benchmark.cpp vectors.config compile_options.config
$(eval VTYPES := $(subst $(space),$(comma),$(strip $(call config_column,vectors.config,2))))
$(eval VNAMES := $(subst $(space),\"$(comma)\",$(strip $(call config_column,vectors.config,3))))
$(eval VNAMES := $(addprefix {\",$(VNAMES)))
$(eval VNAMES := $(addsuffix \"},$(VNAMES)))
$(eval C_OPTIONS:=$(call config_ids,compile_options.config))
@echo "Compiling build for vectors $(VNAMES)"
@$(MY_CXX) $(MY_CXX_FLAGS) $(C_OPTIONS) -DVTYPES="$(VTYPES)" -DVNAMES="$(VNAMES)" -L$(LIB_DIR)\
"$(SRC_DIR)/sdc_benchmark.cpp" -I$(INC_DIR) -o "$(BIN_DIR)/sdcbenchmark" $(LIBS)
$(eval V_IDS := $(call config_ids,vectors.config))
$(eval V_ASSIGNMENTTABLE := $(subst $(space),\n,$(strip $(foreach V_ID,$(V_IDS),\
$(call config_select,vectors.config,$(V_ID),3);$(call config_select,vectors.config,$(V_ID),2)))))
@echo "Writing Vector Assignment Table"
@echo "vector;sdsltype" > $(VAT_FILE)
@echo "$(V_ASSIGNMENTTABLE)" >> $(VAT_FILE)

#execution and creation of test case table
#ARGS holds three arguments per test case: column 3 of test_case.config,
#the input file (same selection as in TC_FILES: ../tmp/BWT_MTF.<id> when
#column 6 contains BWT_MTF, otherwise column 2) and column 5.
#The benchmark output is printed and stored in $(RES_FILE) through tee.
#$(TC_FILE) receives the entries "testcase" and "Overall" followed by
#column 3 of every test case.
#NOTE(review): the "\n" separators written through echo only become line
#breaks if echo expands backslash escapes, which depends on the shell -- confirm
#NOTE(review): because of the pipe, the recipe's exit status is the one of
#tee, not of sdcbenchmark -- confirm this is acceptable
$(RES_FILE): test_case.config $(TC_FILES) $(BIN_DIR)/sdcbenchmark
$(eval ARGS := $(foreach TC_ID,$(TC_IDS),\
$(call config_select,test_case.config,$(TC_ID),3) $(space) \
$(if $(findstring BWT_MTF,$(call config_select,test_case.config,$(TC_ID),6)),\
../tmp/BWT_MTF.$(TC_ID),\
$(call config_select,test_case.config,$(TC_ID),2)) $(space) \
$(call config_select,test_case.config,$(TC_ID),5) ) )
@echo "Executing Benchmark"
@$(BIN_DIR)/sdcbenchmark $(ARGS) | tee $(RES_FILE)
$(eval TC_TABLE := $(subst $(space),\n,$(strip $(call config_column,test_case.config,3))))
@echo "Writing Test Case file"
@echo "testcase\\nOverall" > $(TC_FILE)
@echo "$(TC_TABLE)" >> $(TC_FILE)

include ../Make.download

#the clean targets never create files of their own name
.PHONY: clean-build clean-result cleanall

#delete all compiled executables
clean-build:
	@echo "Remove executables"
	rm -f $(BIN_DIR)/*

#delete all benchmark results
clean-result:
	@echo "Remove results"
	rm -f results/*

#delete executables and results, then clean the report directory
#('&&' is required here: with ';' a failing cd would re-run cleanall in
# this directory and recurse forever)
cleanall: clean-build clean-result
	@cd visualize && $(MAKE) cleanall
55 changes: 55 additions & 0 deletions benchmark/self_delimiting_codes/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Benchmarking self-delimiting codes

## Methodology

Explored dimensions:

* self-delimiting code implementations
* test cases
* methods (`encoding`, `decoding`)

## Directory structure

* [bin](./bin): Contains the executables of the project.
* [results](./results): Contains the results of the experiments.
* [src](./src): Contains the source code of the benchmark.
* [visualize](./visualize): Contains LaTeX files and a Makefile for generating a report.

## Prerequisites

* To run the test on larger test cases (>= 200 MB), you should have at least 2 GB
of free memory (some vectors have very poor compression).
* For the visualization you need the following software:
- [pdflatex][LT] to generate the pdf reports.
- [pgfplots][PGFP] version 1.10 installed in [LT] to generate plots in pdf reports.

## Usage

* `make timing` compiles the programs, downloads or generates
the test instances, builds the compression vectors,
runs the performance tests and generates a report located at
`visualize/self_delimiting_codes.pdf`. The raw numbers of the encoding / decoding
rates and compression can be found in the file `results/result.csv`.
The used test cases can be found in file `results/tc.csv`.
The tested vectors can be found in file `results/vat.csv`.
The default benchmark took about 6 hours on my machine (Asus P50IJ
Pentium(R) Dual-Core CPU T4500 @ 2.30GHz 2GB).
* All created binaries and test results can be deleted
by calling `make cleanall`.

## Customization of the benchmark

The project contains several configuration files:

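* [vectors.config][VCONFIG]: Specify the compression vectors to benchmark and the coders they use.
* [test_case.config][TCCONFIG]: Specify test instances by ID, path, LaTeX-name
for the report, and download URL. The Makefile additionally reads the integer
width of the input (`num_byte`) and an optional `BWT_MTF` marker that requests
the MTF-of-BWT transform of the input from this file.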
* [compile_options.config][CCONFIG]: Specify compile options by option string.

Note that the benchmark will execute every combination of vectors and test cases.

[LT]: http://www.tug.org/applications/pdftex/ "pdflatex"
[PGFP]: http://www.ctan.org/pkg/pgfplots "pgfplots"
[VCONFIG]: ./vectors.config "vectors.config"
[TCCONFIG]: ./test_case.config "test_case.config"
[CCONFIG]: ./compile_options.config "compile_options.config"
2 changes: 2 additions & 0 deletions benchmark/self_delimiting_codes/bin/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
2 changes: 2 additions & 0 deletions benchmark/self_delimiting_codes/compile_options.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Compile options
-O3 -funroll-loops -fomit-frame-pointer -ffast-math -DNDEBUG
2 changes: 2 additions & 0 deletions benchmark/self_delimiting_codes/results/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
119 changes: 119 additions & 0 deletions benchmark/self_delimiting_codes/src/gen_bwt_mtf.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#include <iostream>
#include <fstream>
#include <sdsl/suffix_arrays.hpp>
#include <string>
#include <vector>

using namespace sdsl;

//forward declaration of the routine that writes a vector to a file; the
//output format is selected by num_byte, see the implementations further
//down in this file
//  INT_VECTOR: type of the vector to store
//  num_byte:   format selector
//  v:          vector to store
//  dest:       path of the output file
template<class INT_VECTOR, uint8_t num_byte>
void saveVector(const INT_VECTOR &v, const char *dest);

//main function to generate the MTF (move-to-front) transform of the BWT of
//an integer vector.
//  CSA_WT:       wavelet-tree-based suffix array implementation to use
//  INT_VECTOR:   integer vector type the BWT is loaded into
//  num_byte:     value indicating how the input is read and the result is saved
//  srcfile:      file from which to generate
//  destfile:     file where to save the result
//  tmpdir:       directory used for temporary results
//  conf_bwt_key: cache key under which the BWT can be found after the
//                suffix array construction
template<class CSA_WT, class INT_VECTOR, uint8_t num_byte>
void gen_bwt_mtf(const char *srcfile, const char *destfile, const char *tmpdir,
                 const char *conf_bwt_key) {
    //cache configuration handed to the construction; its file map is
    //cleaned up at the end of this function
    cache_config cc(false, tmpdir, "gen_bwt_mtf_");
    INT_VECTOR bwt;

    //create suffix array
    CSA_WT wt;
    construct(wt, srcfile, cc, num_byte);

    //compute alphabet table from suffix array
    //(the table must start out EMPTY: constructing it with wt.sigma elements
    // and then appending would leave sigma bogus zero entries in front of
    // the real alphabet and distort every MTF rank)
    std::vector<uint64_t> alph_tbl;
    alph_tbl.reserve(wt.sigma);
    for (uint64_t i = 0; i < wt.sigma; i++) {
        alph_tbl.push_back( wt.comp2char[i] );
    }

    //fetch bwt
    load_from_file(bwt, cache_file_name(conf_bwt_key, cc));

    //create mtf (in place: bwt[i] is replaced by the rank of its symbol)
    for (uint64_t i = 0; i < bwt.size(); i++) {
        uint64_t c = bwt[i];
        //find c in alphabet table and move it to front: every entry is
        //shifted back by one position until the old position of c is reached
        //(after the first step alph_tbl.front() holds the searched symbol)
        uint64_t j = 0;
        do {
            uint64_t tmp = alph_tbl[j];
            alph_tbl[j++] = c;
            c = tmp;
        } while (c != alph_tbl.front());
        //and write its index to the mtf transform of the bwt
        bwt[i] = j-1;
    }

    //save everything
    saveVector<INT_VECTOR, num_byte>( bwt, destfile );

    //and free resources
    util::delete_all_files(cc.file_map);
}

//functions for saving an integer vector in different formats
//generic version (raw output): every element of v is written as exactly
//num_byte bytes, least significant byte first.
//  v:    vector to store (needs size() and operator[])
//  dest: path of the output file
//The elements are written one by one instead of dumping v.data(): the
//internal storage of a bit-packed vector only matches the requested layout
//if its element width happens to be exactly 8*num_byte bits.
template<class INT_VECTOR, uint8_t num_byte>
void saveVector(const INT_VECTOR &v, const char *dest) {
    static_assert(num_byte >= 1 && num_byte <= 8,
                  "raw output supports 1 to 8 bytes per element");
    //binary mode: no newline translation of the raw bytes
    std::ofstream out(dest, std::ios::binary);
    char bytes[num_byte];
    for (uint64_t i = 0; i < v.size(); ++i) {
        const uint64_t x = v[i];
        for (uint8_t b = 0; b < num_byte; ++b) {
            bytes[b] = static_cast<char>((x >> (8 * b)) & 0xFF);
        }
        out.write(bytes, num_byte);
    }
}
//num_byte == 0: store the vector in serialized form via store_to_file
template<>
void saveVector<int_vector<>, 0>(const int_vector<> &v, const char *dest) {
    const std::string target(dest);
    store_to_file(v, target);
}
//num_byte == 'd': write the elements as decimal numbers separated by
//single blanks (no leading or trailing separator)
template<>
void saveVector<int_vector<>, 'd'>(const int_vector<> &v, const char *dest) {
    std::ofstream out(dest);
    const char *separator = "";
    for (uint64_t pos = 0; pos < v.size(); ++pos) {
        out << separator << v[pos];
        separator = " ";
    }
}

//program entry point
//expects: input_file output_file temp_dir num_byte
//the first character of num_byte selects the suffix array type, the vector
//type and the output format used for the transform
//returns 0 on success, 1 on wrong usage or unknown num_byte
int main(int argc, char* argv[]) {
    using csa_wt_byte = csa_wt<>;
    using csa_wt_int  = csa_wt<wt_int<>, 64, 64, sa_order_sa_sampling<>, int_vector<>, int_alphabet<>>;

    if (argc != 5) {
        std::cout<<"Usage: input_file output_file temp_dir num_byte" << std::endl;
        return 1;
    }

    const char *input   = argv[1];
    const char *output  = argv[2];
    const char *tempdir = argv[3];
    const char format   = argv[4][0];

    std::cout << "Calculate MTF Transform of BWT of " << input
              << " and store it to " << output << std::endl;

    if (format == 'd') {          //decimal digits
        gen_bwt_mtf<csa_wt_int, int_vector<>, 'd'>(input, output, tempdir, conf::KEY_BWT_INT);
    } else if (format == '0') {   //serialized integer vector
        gen_bwt_mtf<csa_wt_int, int_vector<>, 0>(input, output, tempdir, conf::KEY_BWT_INT);
    } else if (format == '1') {   //byte integer vector
        gen_bwt_mtf<csa_wt_byte, int_vector<8>, 1>(input, output, tempdir, conf::KEY_BWT);
    } else if (format == '2') {   //2 byte integer vector
        gen_bwt_mtf<csa_wt_int, int_vector<>, 2>(input, output, tempdir, conf::KEY_BWT_INT);
    } else if (format == '4') {   //4 byte integer vector
        gen_bwt_mtf<csa_wt_int, int_vector<>, 4>(input, output, tempdir, conf::KEY_BWT_INT);
    } else if (format == '8') {   //8 byte integer vector
        gen_bwt_mtf<csa_wt_int, int_vector<>, 8>(input, output, tempdir, conf::KEY_BWT_INT);
    } else {
        std::cout << "Illegal num_byte, allowed are 'd', 0, 1, 2, 4, 8" << std::endl;
        return 1;
    }
    return 0;
}
Loading