Skip to content

Commit

Permalink
Added utility to convert the Dictionary of an existing HDT to FM-Index.
Browse files Browse the repository at this point in the history
git-svn-id: http://hdt-it.googlecode.com/svn/trunk@239 ca5865f0-ea67-0c57-7e18-94a3701c88e0
  • Loading branch information
mario.arias committed Aug 17, 2012
1 parent 2c00064 commit 7e43168
Show file tree
Hide file tree
Showing 14 changed files with 289 additions and 135 deletions.
9 changes: 6 additions & 3 deletions hdt-lib/src/dictionary/LiteralDictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,11 +209,11 @@ void LiteralDictionary::load(std::istream & input, ControlInformation & ci, Prog
class LiteralIterator : public IteratorUCharString {
private:
IteratorUCharString *child;
unsigned char *nextItem;
unsigned char *previous, *nextItem;
bool goon;

public:
LiteralIterator(IteratorUCharString *child) : child(child), nextItem(NULL), goon(false) {
LiteralIterator(IteratorUCharString *child) : child(child), previous(NULL), nextItem(NULL), goon(false) {
if(child->hasNext()) {
nextItem = child->next();
}
Expand All @@ -232,7 +232,10 @@ class LiteralIterator : public IteratorUCharString {
}

unsigned char *next() {
unsigned char *previous = nextItem;
if(previous) {
child->freeStr(previous);
}
previous = nextItem;
if(child->hasNext()) {
nextItem = child->next();
} else {
Expand Down
8 changes: 4 additions & 4 deletions hdt-lib/src/dictionary/PFCDictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,19 +269,19 @@ void PFCDictionary::import(Dictionary *other, ProgressListener *listener) {
}

/**
 * Iterator accessor for the subjects section.
 * The `return` line hands back whatever subjects->listAll() yields.
 * NOTE(review): this text is a rendered diff ("4 additions & 4 deletions" for
 * this file), so the `throw` line is presumably the removed pre-commit body
 * and the `return` line its replacement; read literally, the `return` would be
 * unreachable. Confirm against the actual source file.
 */
IteratorUCharString *PFCDictionary::getSubjects() {
throw "Not implemented";
return subjects->listAll();
}

/**
 * Iterator accessor for the predicates section.
 * The `return` line hands back whatever predicates->listAll() yields.
 * NOTE(review): this text is a rendered diff ("4 additions & 4 deletions" for
 * this file), so the `throw` line is presumably the removed pre-commit body
 * and the `return` line its replacement; read literally, the `return` would be
 * unreachable. Confirm against the actual source file.
 */
IteratorUCharString *PFCDictionary::getPredicates() {
throw "Not implemented";
return predicates->listAll();
}

/**
 * Iterator accessor for the objects section.
 * The `return` line hands back whatever objects->listAll() yields.
 * NOTE(review): this text is a rendered diff ("4 additions & 4 deletions" for
 * this file), so the `throw` line is presumably the removed pre-commit body
 * and the `return` line its replacement; read literally, the `return` would be
 * unreachable. Confirm against the actual source file.
 */
IteratorUCharString *PFCDictionary::getObjects() {
throw "Not implemented";
return objects->listAll();
}

/**
 * Iterator accessor for the shared section.
 * The `return` line hands back whatever shared->listAll() yields.
 * NOTE(review): this text is a rendered diff ("4 additions & 4 deletions" for
 * this file), so the `throw` line is presumably the removed pre-commit body
 * and the `return` line its replacement; read literally, the `return` would be
 * unreachable. Confirm against the actual source file.
 */
IteratorUCharString *PFCDictionary::getShared() {
throw "Not implemented";
return shared->listAll();
}

void PFCDictionary::save(std::ostream & output, ControlInformation & controlInformation, ProgressListener *listener)
Expand Down
2 changes: 2 additions & 0 deletions hdt-lib/src/libdcs/CSD.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ class CSD

virtual void dumpAll()=0;

/**
 * Pure virtual: every concrete CSD must provide listAll(), returning a pointer
 * to an hdt::IteratorUCharString.
 * NOTE(review): presumably the iterator walks every string stored in the
 * dictionary; who owns/deletes the returned iterator is not stated here - confirm.
 */
virtual hdt::IteratorUCharString *listAll()=0;

/** Returns the number of strings in the dictionary. */
uint32_t getLength();

Expand Down
2 changes: 2 additions & 0 deletions hdt-lib/src/libdcs/CSD_Cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ class CSD_Cache : public CSD
*/
uint decompress(unsigned char **dict);

/** Forwards to child->listAll() and returns its iterator unchanged. */
hdt::IteratorUCharString *listAll() { return child->listAll(); }

/** Returns the size of the structure in bytes. */
uint64_t getSize();

Expand Down
2 changes: 2 additions & 0 deletions hdt-lib/src/libdcs/CSD_Cache2.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ class CSD_Cache2 : public CSD
*/
uint decompress(unsigned char **dict);

/** Forwards to child->listAll() and returns its iterator unchanged. */
hdt::IteratorUCharString *listAll() { return child->listAll(); }

/** Returns the size of the structure in bytes. */
uint64_t getSize();

Expand Down
3 changes: 2 additions & 1 deletion hdt-lib/src/libdcs/CSD_FMIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ CSD_FMIndex::CSD_FMIndex(hdt::IteratorUCharString *it, bool sparse_bitsequence,
samplingsPositions.push_back(total);
}

it->freeStr(currentStr);
total++;
}

Expand All @@ -126,7 +127,7 @@ CSD_FMIndex::CSD_FMIndex(hdt::IteratorUCharString *it, bool sparse_bitsequence,

if (use_sample) {
bitmap = new uint[(total + 1 + W) / W];
memset((void*)bitmap, (total + 1 + W) / W, 0);
memset((void*)bitmap, 0, 4*((total + 1 + W) / W));
bitset(bitmap, 0);
for (size_t i=0;i<samplingsPositions.size();i++){
bitset(bitmap, samplingsPositions[i]);
Expand Down
2 changes: 2 additions & 0 deletions hdt-lib/src/libdcs/CSD_FMIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ namespace csd{

void fillSuggestions(const char *base, vector<string> &out, int maxResults);

/**
 * Not supported by this class: always throws the string literal
 * "Not implemented" (a const char*, not a std::exception) and never returns.
 */
hdt::IteratorUCharString *listAll() { throw "Not implemented"; }

/** General destructor. */
~CSD_FMIndex();

Expand Down
69 changes: 35 additions & 34 deletions hdt-lib/src/libdcs/CSD_HTFC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,12 @@ CSD_HTFC::CSD_HTFC(hdt::IteratorUCharString *it, uint32_t blocksize, hdt::Progre
this->nblocks = 0;

uint64_t reservedSize = 1024;
uchar *textfc = (uchar*)malloc(reservedSize*sizeof(uchar));
unsigned char *textfc = (unsigned char*)malloc(reservedSize*sizeof(unsigned char));
uint64_t bytesfc = 0;

vector<uint> xblocks; // Temporal storage for start positions

uchar *previousStr, *currentStr = NULL;
unsigned char *previousStr, *currentStr = NULL;
uint previousLength = 0, currentLength = 0;

while (it->hasNext())
Expand All @@ -77,7 +77,7 @@ CSD_HTFC::CSD_HTFC(hdt::IteratorUCharString *it, uint32_t blocksize, hdt::Progre
reservedSize=(bytesfc+currentLength+1)*2;
}
}
textfc = (uchar*)realloc(textfc, reservedSize*sizeof(uchar));
textfc = (unsigned char*)realloc(textfc, reservedSize*sizeof(unsigned char));
}

if ((numstrings % blocksize) == 0)
Expand Down Expand Up @@ -120,17 +120,18 @@ CSD_HTFC::CSD_HTFC(hdt::IteratorUCharString *it, uint32_t blocksize, hdt::Progre

// New string processed
numstrings++;
previousStr = currentStr;
memcpy(previousStr, currentStr, currentLength);
previousLength = currentLength;

it->freeStr(currentStr);
//NOTIFYCOND(listener, "Converting dictionary to HTFC", length, it->getNumberOfElements());
}

// Storing the final byte position in the vector of positions
xblocks.push_back(bytesfc);

// Trunc encoded sequence to save unused memory
textfc = (uchar *) realloc(textfc, bytesfc*sizeof(uchar));
textfc = (unsigned char *) realloc(textfc, bytesfc*sizeof(unsigned char));

/********************************
* HERE STARTS HuTucker
Expand All @@ -143,7 +144,7 @@ CSD_HTFC::CSD_HTFC(hdt::IteratorUCharString *it, uint32_t blocksize, hdt::Progre
leafs = ht.getCodes(&HTcode, &tree);

uint64_t tsize = reservedSize/2;
text = (uchar*)malloc(tsize*sizeof(uchar));
text = (unsigned char*)malloc(tsize*sizeof(unsigned char));
for (uint64_t i=0; i<tsize; i++) text[i] = 0; // Fixme: Replace for calloc

// Auxiliar variables for Hu-Tucker encoding
Expand All @@ -162,7 +163,7 @@ CSD_HTFC::CSD_HTFC(hdt::IteratorUCharString *it, uint32_t blocksize, hdt::Progre
tsize=(bytes+maxlength+1)*2;
}
}
text = (uchar*)realloc(text, tsize*sizeof(uchar));
text = (unsigned char*)realloc(text, tsize*sizeof(unsigned char));

for (uint64_t j=bytes+1; j<tsize; j++) text[j] = 0;
}
Expand All @@ -177,7 +178,7 @@ CSD_HTFC::CSD_HTFC(hdt::IteratorUCharString *it, uint32_t blocksize, hdt::Progre
cblocks++;

// Encoding the first string
uchar *first = new uchar[maxlength*2];
unsigned char *first = new unsigned char[maxlength*2];
first[0] = 0;
uint fb = 0, fo = 0; // Variables managing bytes and offsets in the string 'first'

Expand Down Expand Up @@ -281,7 +282,7 @@ CSD_HTFC::~CSD_HTFC()
delete blocks;
}

uint32_t CSD_HTFC::locate(const uchar *s, uint32_t len)
uint32_t CSD_HTFC::locate(const unsigned char *s, uint32_t len)
{
if(!text || !blocks)
return 0;
Expand Down Expand Up @@ -322,7 +323,7 @@ void CSD_HTFC::dumpBlock(uint block) {
}
cout << "Dump block: " << block << endl;
uint pos = blocks->getField(block);
uchar *string = new uchar[maxlength+1];
unsigned char *string = new unsigned char[maxlength+1];

uint slen = strlen((char*)text+pos)+1;

Expand Down Expand Up @@ -359,7 +360,7 @@ void CSD_HTFC::dumpBlock(uint block) {
delete [] string;
}

uchar* CSD_HTFC::extract(uint32_t id)
unsigned char* CSD_HTFC::extract(uint32_t id)
{
if(!text || !blocks) {
return NULL;
Expand All @@ -368,7 +369,7 @@ uchar* CSD_HTFC::extract(uint32_t id)
if ((id > 0) && (id <= numstrings))
{
// Allocating memory for the string
uchar *s = new uchar[maxlength+1];
unsigned char *s = new unsigned char[maxlength+1];

// Calculating block and offset
uint block = (id-1)/blocksize;
Expand All @@ -393,7 +394,7 @@ uint64_t CSD_HTFC::getSize()
if(!text || !blocks) {
return 0;
}
return bytes*sizeof(uchar)+blocks->getSize()+sizeof(CSD_HTFC);
return bytes*sizeof(unsigned char)+blocks->getSize()+sizeof(CSD_HTFC);
}

void CSD_HTFC::save(ofstream & fp)
Expand All @@ -402,12 +403,12 @@ void CSD_HTFC::save(ofstream & fp)
return;
}

saveValue<uchar>(fp, type);
saveValue<unsigned char>(fp, type);
saveValue<uint32_t>(fp, numstrings);
saveValue<uint32_t>(fp, tlength);
saveValue<uint32_t>(fp, maxlength);
saveValue<uint64_t>(fp, bytes);
saveValue<uchar>(fp, text, bytes);
saveValue<unsigned char>(fp, text, bytes);
saveValue<uint32_t>(fp, blocksize);
saveValue<uint32_t>(fp, nblocks);
blocks->save(fp);
Expand Down Expand Up @@ -446,7 +447,7 @@ CSD* CSD_HTFC::load(ifstream & fp)
}
//cout << "FINAL Read: " << counter << " / " << dicc->bytes << endl;
#else
dicc->text = (uchar *) malloc(dicc->bytes*sizeof(unsigned char*));
dicc->text = (unsigned char *) malloc(dicc->bytes*sizeof(unsigned char*));
fp.read((char *)dicc->text, dicc->bytes);
#endif

Expand Down Expand Up @@ -515,10 +516,10 @@ CSD* CSD_HTFC::load(ifstream & fp)
return dicc;
}

bool CSD_HTFC::locateBlock(const uchar *s, uint *block)
bool CSD_HTFC::locateBlock(const unsigned char *s, uint *block)
{
uint slen = strlen((char*)s)+1;
uchar *encoded = new uchar[2*slen];
unsigned char *encoded = new unsigned char[2*slen];
encoded[0] = 0;

// Pattern (s) encoding
Expand Down Expand Up @@ -583,14 +584,14 @@ bool CSD_HTFC::locateBlock(const uchar *s, uint *block)
return false;
}

uint CSD_HTFC::locateInBlock(uint block, const uchar *s, uint len)
uint CSD_HTFC::locateInBlock(uint block, const unsigned char *s, uint len)
{
if(block>=nblocks){
return 0;
}

uchar *deltaseq = new uchar[DELTA];
uchar *tmp = new uchar[maxlength];
unsigned char *deltaseq = new unsigned char[DELTA];
unsigned char *tmp = new unsigned char[maxlength];
uint delta, tmplen;
uint offset = 0;

Expand Down Expand Up @@ -652,9 +653,9 @@ uint CSD_HTFC::locateInBlock(uint block, const uchar *s, uint len)
return id;
}

void CSD_HTFC::extractInBlock(uint block, uint o, uchar *s)
void CSD_HTFC::extractInBlock(uint block, uint o, unsigned char *s)
{
uchar *deltaseq = new uchar[DELTA];
unsigned char *deltaseq = new unsigned char[DELTA];
uint delta;
uint offset = 0;

Expand All @@ -676,19 +677,19 @@ void CSD_HTFC::extractInBlock(uint block, uint o, uchar *s)
delete [] deltaseq;
}

/**
 * Fills deltaseq with bytes produced by decodeHT(seq, pos, offset), writing
 * them at consecutive indices starting at 0. The loop keeps going while the
 * byte just written is < 128, so the first byte >= 128 is stored and ends the
 * sequence. pos and offset are passed straight through to decodeHT.
 * No bound on the number of bytes written to deltaseq is checked here.
 * NOTE(review): this text is a rendered diff; the paired lines that differ only
 * in `uchar` vs `unsigned char` are presumably the removed and added versions
 * of a single source line, not two statements - confirm against the real file.
 */
void CSD_HTFC::decompressDelta(uchar *seq, uint *pos, uint *offset, uchar *deltaseq)
void CSD_HTFC::decompressDelta(unsigned char *seq, uint *pos, uint *offset, unsigned char *deltaseq)
{
uint i = 0;

do
{
deltaseq[i] = (uchar)decodeHT(seq, pos, offset);
deltaseq[i] = (unsigned char)decodeHT(seq, pos, offset);
i++;
}
while (deltaseq[i-1] < 128);
}

uint CSD_HTFC::decompressFirstWord(uchar *seq, uint *pos, uchar *word)
uint CSD_HTFC::decompressFirstWord(unsigned char *seq, uint *pos, unsigned char *word)
{
uint ptr = 0, offset = 0;

Expand All @@ -703,7 +704,7 @@ uint CSD_HTFC::decompressFirstWord(uchar *seq, uint *pos, uchar *word)
return ptr;
}

uint CSD_HTFC::decompressWord(uchar *seq, uint *pos, uint* offset, uchar *suffix)
uint CSD_HTFC::decompressWord(unsigned char *seq, uint *pos, uint* offset, unsigned char *suffix)
{
uint ptr = 0;

Expand All @@ -718,7 +719,7 @@ uint CSD_HTFC::decompressWord(uchar *seq, uint *pos, uint* offset, uchar *suffix
return ptr;
}

uchar CSD_HTFC::decodeHT(uchar *seq, uint *pos, uint *offset)
unsigned char CSD_HTFC::decodeHT(unsigned char *seq, uint *pos, uint *offset)
{
// REVISAR: OTRA IMPLEMENTACION QUE HAGA LOS DESPLAZAMIENTOS
// DE UNO EN UNO CONSIDERANDO UNA ESTRUCTURA TEMPORAL DONDE
Expand All @@ -739,12 +740,12 @@ uchar CSD_HTFC::decodeHT(uchar *seq, uint *pos, uint *offset)
}
}

return (uchar)HTtree[node].symbol;
return (unsigned char)HTtree[node].symbol;
}

void CSD_HTFC::encodeHT(uint code, uint len, uchar *seq, uint *pos, uint *offset)
void CSD_HTFC::encodeHT(uint code, uint len, unsigned char *seq, uint *pos, uint *offset)
{
uchar uccode;
unsigned char uccode;
uint uicode;
uint processed = 0;

Expand All @@ -753,7 +754,7 @@ void CSD_HTFC::encodeHT(uint code, uint len, uchar *seq, uint *pos, uint *offset
// "Saco fuera" los bits ya procesados en 'code'.
uicode = code << (W-len+processed);
// Me quedo con los que quiero
uccode = (uchar)(uicode >> (W-(8-(*offset))));
uccode = (unsigned char)(uicode >> (W-(8-(*offset))));
// Los añado en la posición actual
seq[*pos] = seq[*pos] | uccode;

Expand All @@ -766,13 +767,13 @@ void CSD_HTFC::encodeHT(uint code, uint len, uchar *seq, uint *pos, uint *offset
if (len-processed > 0)
{
uicode = code << (W-len+processed);
uccode = (uchar)(uicode >> (W-(8-(*offset))));
uccode = (unsigned char)(uicode >> (W-(8-(*offset))));
seq[*pos] = seq[*pos] | uccode;
(*offset) += len-processed;
}
}

uint CSD_HTFC::longest_common_prefix(const uchar* str1, const uchar* str2, uint lstr1, uint lstr2)
uint CSD_HTFC::longest_common_prefix(const unsigned char* str1, const unsigned char* str2, uint lstr1, uint lstr2)
{
uint delta = 0;
uint length = lstr1;
Expand Down
2 changes: 2 additions & 0 deletions hdt-lib/src/libdcs/CSD_HTFC.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ class CSD_HTFC : public CSD

void fillSuggestions(const char *base, vector<string> &out, int maxResults);

/**
 * Not supported by this class: always throws the string literal
 * "Not implemented" (a const char*, not a std::exception) and never returns.
 */
hdt::IteratorUCharString *listAll() { throw "Not implemented"; }

protected:
uint64_t bytes; //! Size of the Front-Coding encoded sequence (in bytes).
uchar *text; //! Front-Coding encoded sequence.
Expand Down
Loading

0 comments on commit 7e43168

Please sign in to comment.