Skip to content

Commit

Permalink
hashread improved using a fixed prefix
Browse files Browse the repository at this point in the history
  • Loading branch information
gpertea committed May 11, 2016
1 parent e94679a commit 954b58c
Show file tree
Hide file tree
Showing 7 changed files with 274 additions and 96 deletions.
24 changes: 24 additions & 0 deletions gclib/GBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,30 @@ int strhash(const char* str){
return h;
}

// Bernstein-style string hash, XOR variant: h = (33*h) ^ c, seeded with 5381.
// cp must point to a NUL-terminated string (it is dereferenced unconditionally).
// Returns the accumulated hash with the top bit cleared, so the result is
// never negative.
int djb_hash(const char* cp)
{
  // Accumulate in unsigned arithmetic: the multiplication is expected to wrap,
  // and wrapping a signed int is undefined behavior.
  unsigned int h = 5381u;
  while (*cp)
    h = (33u * h) ^ static_cast<unsigned char>(*cp++);
  return static_cast<int>(h & 0x7FFFFFFFu); // sign bit cleared: never negative
}

/* Fowler/Noll/Vo (FNV) hash function, variant 1a.
 * cp must point to a NUL-terminated string (it is dereferenced unconditionally).
 * Returns the accumulated hash with the top bit cleared, so the result is
 * never negative.
 * The state is kept in an unsigned int because the multiplication is expected
 * to wrap; wrapping a signed int is undefined behavior, and the seed value
 * does not fit in a signed 32-bit int.
 * NOTE(review): the wrap-around assumes a 32-bit unsigned int. */
int fnv1a_hash(const char* cp) {
  unsigned int h = 0x811c9dc5u;   // 32-bit FNV offset basis
  while (*cp) {
    h ^= static_cast<unsigned char>(*cp++);
    h *= 0x01000193u;             // 32-bit FNV prime
  }
  return static_cast<int>(h & 0x7FFFFFFFu);
}


// removes the last part (file or directory name) of a full path
// this is a destructive operation for the given string!!!
// the trailing '/' is guaranteed to be there
Expand Down
4 changes: 3 additions & 1 deletion gclib/GBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,9 @@ bool endsWith(const char* s, const char* suffix);
// ELF hash function for strings
int strhash(const char* str);


//alternate hash functions:
int fnv1a_hash(const char* cp);
int djb_hash(const char* cp);

//---- generic base GSeg : genomic segment (interval) --
// coordinates are considered 1-based (so 0 is invalid)
Expand Down
4 changes: 4 additions & 0 deletions gclib/GBitVec.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@ class GBitVec {
if (value)
clear_unused_bits();
}
/// Returns Capacity multiplied by sizeof(BitWord), i.e. the byte size of
/// Capacity words of BitWord storage.
unsigned long getMemorySize() const {
  return static_cast<unsigned long>(Capacity) * sizeof(BitWord);
}

/// GBitVec copy ctor.
GBitVec(const GBitVec &RHS) : Size(RHS.size()) {
Expand Down
191 changes: 161 additions & 30 deletions gclib/GHash.hh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
* indexed by a character string (essentially, maps strings to pointers)
*/

//#define HASH_DBG_PRINT 1

#define GSTR_HASH(s) strhash(s)
//#define GSTR_HASH(s) djb_hash(s)
//#define GSTR_HASH(s) fnv1a_hash(s)

template <class OBJ> class GHash {
protected:
Expand Down Expand Up @@ -62,17 +67,22 @@ public:
int Capacity() const { return fCapacity; } // table's size, including the empty slots.
void Resize(int m); // Resize the table to the given size.
int Count() const { return fCount; }// the total number of entries in the table.

// Insert a new entry into the table given key and mark.
// If there is already an entry with that key, its data pointer is replaced
// and the previous data pointer is returned.
const OBJ* Add(const char* ky, const OBJ* ptr=NULL, bool mrk=false);
OBJ* Add(const char* ky, OBJ* ptr=NULL, bool mrk=false);

//same as Add, but frees the old element if it's a replacement
OBJ* fAdd(const char* ky, OBJ* ptr=NULL);

//same as Add, but the key pointer is stored directly, no string duplicate
//is made (shared-key-Add)
const OBJ* shkAdd(const char* ky, const OBJ* ptr, bool mrk=false);
OBJ* shkAdd(const char* ky, OBJ* ptr, bool mrk=false);

// Replace data at key, if the entry's mark is less than
// or equal to the given mark. If there was no existing entry,
// a new entry is inserted with the given mark.
OBJ* Replace(const char* ky, const OBJ* ptr, bool mrk=false);
OBJ* Replace(const char* ky, OBJ* ptr, bool mrk=false);
// Remove a given key and its data
OBJ* Remove(const char* ky);
// Find data OBJ* given key.
Expand Down Expand Up @@ -197,37 +207,141 @@ template <class OBJ> void GHash<OBJ>::Resize(int m){
}

// add a new entry, or update it if it already exists
template <class OBJ> const OBJ* GHash<OBJ>::Add(const char* ky,
const OBJ* pdata,bool mrk){


// Store pdata under key ky, inserting a new entry or overwriting the data of
// an existing one.
//   ky    - NUL-terminated key; GError() is called if it is NULL.
//   pdata - data pointer to associate with the key.
//   mrk   - mark stored with a newly inserted entry (an existing entry's mark
//           is left untouched).
// Returns:
//   - when the key already exists: the data pointer that was previously
//     stored (the entry now holds pdata);
//   - when a new entry is created: pdata.
// A new entry gets its own copy of the key (Gstrdup), and lastkeyptr is set
// to the stored key in both cases.
template <class OBJ> OBJ* GHash<OBJ>::Add(const char* ky,
                      OBJ* pdata, bool mrk){
  int p,i,x,h,n;
  if(!ky) GError("GHash::insert: NULL key argument.\n");
  GASSERT(fCount<fCapacity);
  h=GSTR_HASH(ky);
  GASSERT(0<=h);
  p=HASH1(h,fCapacity); // initial probe position
  GASSERT(0<=p && p<fCapacity);
  x=HASH2(h,fCapacity); // probe step
  GASSERT(1<=x && x<fCapacity);
  i=-1;                 // first reusable slot seen while probing, if any
  n=fCapacity;          // bounds the probe sequence to one pass over the table
#ifdef HASH_DBG_PRINT
  int iterations=0;
  int init_p=p;
  int init_x=x;
#endif
  // Probe until a slot whose hash is -1 is reached; slots whose hash is -2 are
  // stepped over, but the first one is remembered so it can be reused
  // (presumably -1 marks a never-used slot and -2 a removed one -- confirm
  // against Remove()).
  while(n && hash[p].hash!=-1) {
    if ((i==-1)&&(hash[p].hash==-2)) i=p;
    if (hash[p].hash==h && strcmp(hash[p].key,ky)==0) {
      //replace hash data for this key!
      lastkeyptr=hash[p].key;
      OBJ* r = (OBJ*) hash[p].data;
      hash[p].data = (void*) pdata;
#ifdef HASH_DBG_PRINT
      GMessage("Add.R\t%s\t%d,%d,%d\t%d\t%d\t%d\n",
          ky, h,init_p,init_x, iterations, fCount, fCapacity);
#endif
      return r; // previous data pointer, not the one just stored
    }
    p=(p+x)%fCapacity;
#ifdef HASH_DBG_PRINT
    ++iterations; // count probe steps so the debug output is meaningful
#endif
    n--;
  }
  if(i==-1) i=p; // no reusable slot seen: use the slot where probing stopped
#ifdef HASH_DBG_PRINT
  GMessage("Add.N\t%s\t%d,%d,%d\t%d\t%d\t%d\n",
      ky, h,init_p,init_x, iterations, fCount, fCapacity);
#endif
  GTRACE(("GHash::insert: key=\"%s\"\n",ky));
  //GMessage("GHash::insert: key=\"%s\"\n",ky);
  GASSERT(0<=i && i<fCapacity);
  GASSERT(hash[i].hash<0);
  hash[i].hash=h;
  hash[i].mark=mrk;
  hash[i].key=Gstrdup(ky);
  hash[i].keyalloc=true; // flags that the key string was allocated here
  lastkeyptr=hash[i].key;
  hash[i].data= (void*) pdata;
  fCount++;
  // Resize once the load (in percent) reaches MAX_LOAD.
  if((100*fCount)>=(MAX_LOAD*fCapacity)) Resize(fCount);
  GASSERT(fCount<fCapacity);
  return pdata;
}

/*
int p,i,x,h;
if(!ky) GError("GHash::insert: NULL key argument.\n");
GASSERT(fCount<fCapacity);
h=GSTR_HASH(ky);
GASSERT(0<=h);
p=HASH1(h,fCapacity);
GASSERT(0<=p && p<fCapacity);
x=HASH2(h,fCapacity);
GASSERT(1<=x && x<fCapacity);
if (checkReplace(ky, pdata, p, i, h, x)) {
return (OBJ*)hash[p].data;
}
GTRACE(("GHash::insert: key=\"%s\"\n",ky));
//GMessage("GHash::insert: key=\"%s\"\n",ky);
GASSERT(0<=i && i<fCapacity);
GASSERT(hash[i].hash<0);
hash[i].hash=h;
hash[i].mark=mrk;
hash[i].key=Gstrdup(ky);
hash[i].keyalloc=true;
lastkeyptr=hash[i].key;
hash[i].data= (void*) pdata;
fCount++;
if((100*fCount)>=(MAX_LOAD*fCapacity)) Resize(fCount);
GASSERT(fCount<fCapacity);
return pdata;
}
*/
template <class OBJ> OBJ* GHash<OBJ>::fAdd(const char* ky,
OBJ* pdata){
register int p,i,x,h,n;
if(!ky) GError("GHash::insert: NULL key argument.\n");
GASSERT(fCount<fCapacity);
h=strhash(ky);
h=GSTR_HASH(ky);
GASSERT(0<=h);
p=HASH1(h,fCapacity);
GASSERT(0<=p && p<fCapacity);
x=HASH2(h,fCapacity);
GASSERT(1<=x && x<fCapacity);
i=-1;
n=fCapacity;
while(n && hash[p].hash!=-1){
#ifdef HASH_DBG_PRINT
int iterations=0;
int init_p=p;
int init_x=x;
#endif
while(n && hash[p].hash!=-1) {
if ((i==-1)&&(hash[p].hash==-2)) i=p;
if (hash[p].hash==h && strcmp(hash[p].key,ky)==0) {
//replace hash data for this key!
lastkeyptr=hash[p].key;
if (FREEDATA) (*fFreeProc)(hash[p].data);
hash[p].data = (void*) pdata;
return (OBJ*)hash[p].data;
#ifdef HASH_DBG_PRINT
GMessage("Add.R\t%s\t%d,%d,%d\t%d\t%d\t%d\n",
ky, h,init_p,init_x, iterations, fCount, fCapacity);
#endif
return pdata;
}
p=(p+x)%fCapacity;
#ifdef HASH_DBG_PRINT
++iterations;
#endif
n--;
}
if(i==-1) i=p;
#ifdef HASH_DBG_PRINT
GMessage("Add.N\t%s\t%d,%d,%d\t%d\t%d\t%d\n",
ky, h,init_p,init_x, iterations, fCount, fCapacity);
#endif
GTRACE(("GHash::insert: key=\"%s\"\n",ky));
//GMessage("GHash::insert: key=\"%s\"\n",ky);
GASSERT(0<=i && i<fCapacity);
GASSERT(hash[i].hash<0);
hash[i].hash=h;
hash[i].mark=mrk;
hash[i].mark=false;
hash[i].key=Gstrdup(ky);
hash[i].keyalloc=true;
lastkeyptr=hash[i].key;
Expand All @@ -238,12 +352,14 @@ template <class OBJ> const OBJ* GHash<OBJ>::Add(const char* ky,
return pdata;
}

template <class OBJ> const OBJ* GHash<OBJ>::shkAdd(const char* ky,
const OBJ* pdata,bool mrk){


template <class OBJ> OBJ* GHash<OBJ>::shkAdd(const char* ky,
OBJ* pdata,bool mrk){
register int p,i,x,h,n;
if(!ky) GError("GHash::insert: NULL key argument.\n");
GASSERT(fCount<fCapacity);
h=strhash(ky);
h=GSTR_HASH(ky);
GASSERT(0<=h);
p=HASH1(h,fCapacity);
GASSERT(0<=p && p<fCapacity);
Expand Down Expand Up @@ -281,11 +397,11 @@ template <class OBJ> const OBJ* GHash<OBJ>::shkAdd(const char* ky,


// Add or replace entry
template <class OBJ> OBJ* GHash<OBJ>::Replace(const char* ky,const OBJ* pdata, bool mrk){
template <class OBJ> OBJ* GHash<OBJ>::Replace(const char* ky, OBJ* pdata, bool mrk){
register int p,i,x,h,n;
if(!ky){ GError("GHash::replace: NULL key argument.\n"); }
GASSERT(fCount<fCapacity);
h=strhash(ky);
h=GSTR_HASH(ky);
GASSERT(0<=h);
p=HASH1(h,fCapacity);
GASSERT(0<=p && p<fCapacity);
Expand Down Expand Up @@ -328,7 +444,7 @@ template <class OBJ> OBJ* GHash<OBJ>::Remove(const char* ky){
if(!ky){ GError("GHash::remove: NULL key argument.\n"); }
OBJ* removed=NULL;
if(0<fCount){
h=strhash(ky);
h=GSTR_HASH(ky);
GASSERT(0<=h);
p=HASH1(h,fCapacity);
GASSERT(0<=p && p<fCapacity);
Expand Down Expand Up @@ -364,7 +480,7 @@ template <class OBJ> bool GHash<OBJ>::hasKey(const char* ky) {
register int p,x,h,n;
if(!ky){ GError("GHash::find: NULL key argument.\n"); }
if(0<fCount){
h=strhash(ky);
h=GSTR_HASH(ky);
GASSERT(0<=h);
p=HASH1(h,fCapacity);
GASSERT(0<=p && p<fCapacity);
Expand All @@ -383,30 +499,45 @@ template <class OBJ> bool GHash<OBJ>::hasKey(const char* ky) {
return false;
}


template <class OBJ> OBJ* GHash<OBJ>::Find(const char* ky, char** keyptr){
register int p,x,h,n;
if(!ky){ GError("GHash::find: NULL key argument.\n"); }
if(0<fCount){
h=strhash(ky);
GASSERT(0<=h);
p=HASH1(h,fCapacity);
GASSERT(0<=p && p<fCapacity);
x=HASH2(h,fCapacity);
GASSERT(1<=x && x<fCapacity);
GASSERT(fCount<fCapacity);
n=fCapacity;
while(n && hash[p].hash!=-1){
if (fCount==0) return NULL;
h=GSTR_HASH(ky);
GASSERT(0<=h);
p=HASH1(h,fCapacity);
GASSERT(0<=p && p<fCapacity);
x=HASH2(h,fCapacity);
GASSERT(1<=x && x<fCapacity);
GASSERT(fCount<fCapacity);
n=fCapacity;
#ifdef HASH_DBG_PRINT
int iterations=0;
int init_p=p;
int init_x=x;
#endif
while(n && hash[p].hash!=-1){
if(hash[p].hash==h && strcmp(hash[p].key,ky)==0){
if (keyptr!=NULL) *keyptr = hash[p].key;
#ifdef HASH_DBG_PRINT
GMessage("Found \t%s\t%d,%d,%d\t%d\t%d\t%d\n",
ky, h,init_p,init_x, iterations, fCount, fCapacity);
#endif
return (OBJ*)hash[p].data;
}
p=(p+x)%fCapacity;
n--;
}
}
return NULL;
#ifdef HASH_DBG_PRINT
++iterations;
#endif
}

#ifdef HASH_DBG_PRINT
GMessage("Nfound\t%s\t%d,%d,%d\t%d\t%d\t%d\n",
ky, h,init_p,init_x, iterations, fCount, fCapacity);
#endif
return NULL;
}

template <class OBJ> void GHash<OBJ>::startIterate() {// initialize a key iterator; call
fCurrentEntry=0;
Expand Down
Loading

0 comments on commit 954b58c

Please sign in to comment.