A typical rolling hash function for anagrams
- uses a product of primes (one prime per letter)
- This will only work for relatively short patterns
- The hash values for almost all normal words will fit into a 64-bit value without overflow.
- Based on this anagram matcher
/* braek; */
/* 'foobaroofzaqofom' */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/* Integer type that carries both per-character hash factors and accumulated hashes. */
typedef unsigned long long HashVal;
/* Forward declarations of the two hashing helpers defined further down in this file. */
static HashVal hashchar (unsigned char ch);
static HashVal hashmem (void *ptr, size_t len);
/*
 * Prime factor assigned to each letter, indexed by alphabet position
 * (0 = 'a'/'A' ... 25 = 'z'/'Z').
 * The smallest primes go to e, n, a, t, i, o -- presumably ordered by letter
 * frequency so that products of ordinary words stay small.
 */
unsigned char primes26[] = {
    /* a */   5, /* b */  71, /* c */  79, /* d */  19, /* e */   2,
    /* f */  83, /* g */  31, /* h */  43, /* i */  11, /* j */  53,
    /* k */  37, /* l */  23, /* m */  41, /* n */   3, /* o */  13,
    /* p */  73, /* q */ 101, /* r */  17, /* s */  29, /* t */   7,
    /* u */  59, /* v */  47, /* w */  61, /* x */  97, /* y */  89,
    /* z */  67,
};
/*********************************************/
/*
 * Return the hash factor for a single byte.
 *
 * ASCII letters map, case-insensitively, to their entry in primes26[];
 * every other byte maps to 1, the neutral element of the product hash.
 */
static HashVal hashchar (unsigned char ch)
{
    if (ch >= 'a' && ch <= 'z') {
        return primes26[ch - 'a'];
    }
    if (ch >= 'A' && ch <= 'Z') {
        return primes26[ch - 'A'];
    }
    return 1;
}
/*
 * Hash a block of memory as the product of hashchar() over all of its bytes.
 *
 * ptr: start of the block; len: number of bytes to hash.
 * Returns 0 for an empty block, otherwise the (unsigned, wrapping) product.
 * The result does not depend on byte order, so anagrams hash identically.
 */
static HashVal hashmem (void *ptr, size_t len)
{
    const unsigned char *cur = ptr;
    const unsigned char *end;
    HashVal product = 1;

    if (len == 0) {
        return 0;
    }
    for (end = cur + len; cur != end; cur++) {
        product *= hashchar(*cur);
    }
    return product;
}
/*********************************************/
/*
 * Cyclic buffer holding the last patlen letters of the current letter run.
 * Its size is the upper bound on the accepted pattern length.
 */
unsigned char buff [4096];

/*
 * Scan stdin for anagrams of the pattern given as argv[1].
 *
 * A rolling product-of-primes hash is kept over the most recent patlen
 * letters. Any non-letter byte ends the current run and resets the window.
 * For every window whose hash equals the pattern's hash, "Pos=N" is printed,
 * where N is the 0-based input offset of the LAST byte of that window.
 *
 * Returns 0 on success, EXIT_FAILURE when the pattern is missing, empty,
 * or longer than buff.
 *
 * Note: the hash wraps silently on overflow; once that happens the division
 * used to drop the oldest letter no longer undoes its multiplication, so the
 * method is only reliable for short patterns.
 */
int main (int argc, char **argv)
{
    size_t patlen,len,pos,rotor;
    int ch;
    HashVal patval;
    HashVal rothash=1;

    /* Without this check strlen(argv[1]) would dereference NULL. */
    if (argc < 2) {
        fprintf(stderr, "Usage: %s pattern < input\n",
                (argc > 0 && argv[0]) ? argv[0] : "anascan");
        return EXIT_FAILURE;
    }

    patlen = strlen(argv[1]);
    /* patlen == 0 would divide by zero below; patlen > sizeof buff would overrun it. */
    if (patlen == 0 || patlen > sizeof buff) {
        fprintf(stderr, "Pattern length must be between 1 and %zu bytes\n", sizeof buff);
        return EXIT_FAILURE;
    }
    patval = hashmem( argv[1], patlen);

    for (rotor=pos=len =0; ; len++) {
        ch=getc(stdin);
        if (ch == EOF) break;

        /* Anything that is not an ASCII letter terminates the current run. */
        if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))) {
            pos = 0;
            rothash = 1;
            continue;
        }

        /* Window already full: divide out the oldest letter, which sits at buff[rotor]. */
        if (pos >= patlen) {
            rothash /= hashchar(buff[rotor]);
        }

        /* Store the new letter over the oldest slot and multiply it in. */
        buff[rotor] = ch;
        rothash *= hashchar(buff[rotor]);
        rotor = (rotor+1) % patlen;

        /* Not enough letters in this run yet to fill a window. */
        if (++pos < patlen) continue;

        /* Window full: report it only when the hash matches the pattern's. */
        if (rothash != patval) continue;
        fprintf(stdout, "Pos=%zu\n", len);
    }
    return 0;
}
Output/result:
$ ./a.out foo < anascan.c
Pos=21
Pos=27
Pos=33
Update: for people who don't like the product of primes, here is a taxicab-number-style sum-of-cubes implementation (plus an additional histogram check). This is also supposed to be 8-bit clean. Note that the cubes are not necessary; it works equally well with squares, or just the plain sum (the final histogram check will then have some more work to do).
/* braek; */
/* 'foobaroofzaqofom' */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/* Integer type that carries both per-byte hash terms and accumulated hashes. */
typedef unsigned long long HashVal;
/* Forward declarations of the two hashing helpers defined further down in this file. */
static HashVal hashchar (unsigned char ch);
static HashVal hashmem (void *ptr, size_t len);
/*********************************************/
static HashVal hashchar (unsigned char ch)
{
HashVal val=1+ch;
return val*val*val;
}
/*
 * Hash a block of memory as 1 plus the sum of hashchar() over all its bytes.
 *
 * ptr: start of the block; len: number of bytes to hash.
 * Returns 0 for an empty block. The sum is independent of byte order, so
 * anagrams hash identically.
 */
static HashVal hashmem (void *ptr, size_t len)
{
    const unsigned char *bytes = ptr;
    HashVal sum = 1;
    size_t left = len;

    if (left == 0) {
        return 0;
    }
    while (left-- > 0) {
        sum += hashchar(*bytes++);
    }
    return sum;
}
/*********************************************/
/*
 * Scan stdin for anagrams of the pattern given as argv[1] (any byte values).
 *
 * A rolling sum-of-cubes hash and a byte histogram are kept over the most
 * recent patlen input bytes. A window is reported when its hash equals the
 * pattern's hash AND its histogram equals the pattern's histogram, so hash
 * collisions cannot produce false matches. For every match "Pos=N" is
 * printed, where N is the 0-based input offset of the FIRST byte of the
 * matching window.
 *
 * Returns 0 on success, EXIT_FAILURE when the pattern is missing, empty,
 * or longer than the cyclic buffer.
 */
int main (int argc, char **argv)
{
    size_t patlen,len,rotor;
    int ch;
    HashVal patval;
    HashVal rothash=1;      /* starts at 1 to mirror hashmem()'s initial value */
    unsigned char *patstr;
    unsigned pathist[256] = {0};
    unsigned rothist[256] = {0};
    unsigned char cycbuff[1024];

    /* Without this check strlen(argv[1]) would dereference NULL. */
    if (argc < 2) {
        fprintf(stderr, "Usage: %s pattern < input\n",
                (argc > 0 && argv[0]) ? argv[0] : "anascan");
        return EXIT_FAILURE;
    }

    patstr = (unsigned char*) argv[1];
    patlen = strlen((const char*) patstr);
    /* patlen == 0 would divide by zero below; patlen > sizeof cycbuff would overrun it. */
    if (patlen == 0 || patlen > sizeof cycbuff) {
        fprintf(stderr, "Pattern length must be between 1 and %zu bytes\n", sizeof cycbuff);
        return EXIT_FAILURE;
    }
    patval = hashmem( patstr, patlen);

    /* Byte histogram of the pattern, used for the final exact comparison. */
    for(rotor=0; rotor < patlen; rotor++) {
        pathist [ patstr[rotor] ] += 1;
    }
    fprintf(stderr, "Pat=%s, len=%zu, Hash=%llx\n", argv[1], patlen, patval);

    for (rotor=len =0; ; len++) {
        ch=getc(stdin);
        if (ch == EOF) break;

        /* Window already full: drop the oldest byte, which sits at cycbuff[rotor]. */
        if (len >= patlen) {
            rothash -= hashchar(cycbuff[rotor]);
            rothist [ cycbuff[rotor] ] -= 1;
        }

        /* Store the new byte over the oldest slot and account for it. */
        cycbuff[rotor] = ch;
        rothash += hashchar(cycbuff[rotor]);
        rothist [ cycbuff[rotor] ] += 1;
        rotor = (rotor+1) % patlen;

        /*
         * After consuming byte number len the window holds len+1 bytes (capped
         * at patlen), so it is first complete when len == patlen-1. Testing
         * "len < patlen" here would skip that first window.
         */
        if (len + 1 < patlen) continue;

        /* Cheap rejection on the hash first ... */
        if (rothash != patval) continue;
        /* ... then the exact check on the histograms. */
        if (memcmp(rothist,pathist, sizeof pathist)) continue;

        /* The window covers offsets [len+1-patlen, len]; report its start. */
        fprintf(stdout, "Pos=%zu\n", len + 1 - patlen);
    }
    return 0;
}