#include #include #include #include #ifndef NULL #define NULL (void *)0 #endif #define ALPHABET_LEN 128 #define max(a, b) ((a < b) ? b : a) /** // BAD CHARACTER RULE. // delta1 table: delta1[c] contains the distance between the last // character of pat and the rightmost occurrence of c in pat. // // If c does not occur in pat, then delta1[c] = patlen. // If c is at string[i] and c != pat[patlen-1], we can safely shift i // over by delta1[c], which is the minimum distance needed to shift // pat forward to get string[i] lined up with some character in pat. // c == pat[patlen-1] returning zero is only a concern for BMH, which // does not have delta2. BMH makes the value patlen in such a case. // We follow this choice instead of the original 0 because it skips // more. (correctness?) // // This algorithm runs in alphabet_len+patlen time. * @param delta1 -- the array to be filled * @param patlen -- the length of the pattern being searched for. * @param par -- the string being searched for **/ void make_delta1(int *delta1, int patlen, char pat[patlen]) { for (int i=0; i < ALPHABET_LEN; i++) { delta1[i] = patlen; } for (int i=0; i < patlen-2; i++) { delta1[pat[i]] = patlen - 1 - i; } } /** * @param wordlen the length of the thing being considered * @param word the thing being considered * @param pos -- position in the word being considered (pos < wordlen) * @return true if the suffix of word starting from word[pos] is a prefix of word * **/ int is_prefix(int wordlen, char word[wordlen], int pos) { int suffixlen = wordlen - pos; // could also use the strncmp() library function here // return ! strncmp(word, &word[pos], suffixlen); for (int i = 0; i < suffixlen; i++) { if (word[i] != word[pos + i]) { return 0; } } return 1; } /** * @param wordlen the length of the thing being considered * @param word the thing being considered * @param pos -- position in the word being considered (pos < wordlen) * @return length of the longest suffix of word ending on word[pos]. * suffix_length(8m "dddbcabc", 4) = 2 * **/ int suffix_length(int wordlen, char word[wordlen], int pos) { int i; // increment suffix length i to the first mismatch or beginning // of the word for (i = 0; (word[pos-i] == word[wordlen-1-i]) && (i < pos); i++); return i; } /** // GOOD SUFFIX RULE. // delta2 table: given a mismatch at pat[pos], we want to align // with the next possible full match could be based on what we // know about pat[pos+1] to pat[patlen-1]. // // In case 1: // pat[pos+1] to pat[patlen-1] does not occur elsewhere in pat, // the next plausible match starts at or after the mismatch. // If, within the substring pat[pos+1 .. patlen-1], lies a prefix // of pat, the next plausible match is here (if there are multiple // prefixes in the substring, pick the longest). Otherwise, the // next plausible match starts past the character aligned with // pat[patlen-1]. // // In case 2: // pat[pos+1] to pat[patlen-1] does occur elsewhere in pat. The // mismatch tells us that we are not looking at the end of a match. // We may, however, be looking at the middle of a match. // // The first loop, which takes care of case 1, is analogous to // the KMP table, adapted for a 'backwards' scan order with the // additional restriction that the substrings it considers as // potential prefixes are all suffixes. In the worst case scenario // pat consists of the same letter repeated, so every suffix is // a prefix. This loop alone is not sufficient, however: // Suppose that pat is "ABYXCDBYX", and text is ".....ABYXCDEYX". // We will match X, Y, and find B != E. There is no prefix of pat // in the suffix "YX", so the first loop tells us to skip forward // by 9 characters. // Although superficially similar to the KMP table, the KMP table // relies on information about the beginning of the partial match // that the BM algorithm does not have. // // The second loop addresses case 2. Since suffix_length may not be // unique, we want to take the minimum value, which will tell us // how far away the closest potential match is. * * @param delta2 -- the array to be filled * @param patlen -- the length of the pattern being searched for. * @param par -- the string being searched for **/ void make_delta2(int *delta2, int patlen, char pat[patlen]) { int p; int last_prefix_index = patlen-1; // first loop for (p=patlen-1; p>=0; p--) { if (is_prefix(patlen, pat, p+1)) { last_prefix_index = p + 1; // NOTE this is true the first time so last_p_i actually starts at patlen because the first time we a looking at a null suffix } delta2[p] = last_prefix_index + (patlen - 1 - p); } // second loop for (p=0; p < patlen-1; p++) { int slen = suffix_length(patlen, pat, p); if (pat[p - slen] != pat[patlen - 1 - slen]) { delta2[patlen-1 - slen] = patlen-1 - p + slen; } } } /** * Perfom a boyer-moore search for the patter given by needle int he string given by haystack * @paran patlen -- the length of the string needle * @param needle -- the string to be searched for * @param stringlen -- the length of the haystack * @param haystack -- the stirng to be searched * @param delta1 -- the boyer-moore delta one array * @param delta2 -- the boyer-moore delta2 array * @return the first location of needle in haystack, or -1 is needle cannot be found. * **/ int boyer_moore (int patlen, char needle[patlen], int stringlen, char haystack[stringlen], int delta1[], int delta2[]) { long stepp = 0; // The empty pattern must be considered specially if (patlen == 0) { return 0; } int pi = -1; int i = patlen - 1; // str-idx while (i < stringlen) { int j = patlen - 1; // pat-idx #ifdef ONCE printf("C: %d %d\n", i, j); #endif while (j >= 0 && (haystack[i] == needle[j])) { #ifdef ONCE printf("M: %d %d\n", i, j); #endif --i; --j; } if (j < 0) { return i+1; } #ifdef ONCE printf("SHIFTING: %d %c %d %d\n", i, haystack[i], delta1[haystack[i]], delta2[j]); #endif int shift = max(delta1[haystack[i]], delta2[j]); /** if ((i+shift)<=pi) { printf("%d Possible LOOP DETECTED %d %d , %c %d\n", stepp, (i+shift), pi, haystack[i], delta1[haystack[i]]); printf("needle: %s %c %c %c %c %c\n", needle, haystack[i], haystack[i+1], haystack[i+2], haystack[i+3], haystack[i+4]); for (int i = 0; i < ALPHABET_LEN; i++) { if (delta1[i]