/**
Create a new node using the given rule id and text string
*/
-static ac_node *new_node(void *user_data, const char *text)
+static ac_node *new_node_strlen(void *user_data, const char *text, size_t textlen)
{
ac_node *node = (ac_node *)malloc(sizeof(ac_node));
+	/* out of memory: report failure instead of writing through NULL */
+	if (node == NULL)
+		return NULL;
memset(node, 0, sizeof(ac_node));
- node->branch = strdup(text);
+	if (textlen > 0) {
+		/* branch is a raw copy of textlen bytes (no NUL terminator); brlen records its length */
+		node->branch = (char *)malloc(textlen);
+		if (node->branch == NULL) {
+			free(node);
+			return NULL;
+		}
+		memcpy(node->branch, text, textlen);
+		node->brlen = (int)textlen;
+	}
node->user_data = user_data;
- node->is_match = TRUE;
+	node->checked = FALSE;
+	node->terminal = TRUE;
return node;
}
+static ac_node *new_node(void *user_data, const char *text)
+{
+	/* text is a C string: its length runs up to (not including) the NUL */
+	const size_t textlen = strlen(text);
+
+	return new_node_strlen(user_data, text, textlen);
+}
+
/**
Append child (leaf) node to parent at position idx
*/
/**
Compare string s1 with string s2
return value:
- 0 => s1 before s2
- -1 => s1 after s2
- other values => Number of initial letters in common
+ 0 => s1 before s2
+ -1 => s1 after s2
+ other values => Number of initial letters in common
+#ifdef CASE_INSENSITIVE
+ case conversion (tolower) should be done before calling compare()
+#endif
*/
-static int compare(const char *s1, const char *s2)
+static int compare(const char *s1, size_t s1len, const char *s2, size_t s2len)
{
int i;
- size_t commonlen, s2len;
-
- if (strlen(s1) == 0)
- return 0;
+ size_t commonlen;
- if (strlen(s2) == 0)
- return -1;
-
-#ifdef CASE_INSENSITIVE
- if (tolower(s1[0]) < tolower(s2[0]))
+ if (s1len == 0)
return 0;
- if (tolower(s2[0]) < tolower(s1[0]))
+ if (s2len == 0)
return -1;
-#else
if (s1[0] < s2[0])
return 0;
if (s2[0] < s1[0])
return -1;
-#endif
- commonlen = strlen(s1);
- if ((s2len = strlen(s2)) < commonlen)
+ commonlen = s1len;
+ if (s2len < commonlen)
commonlen = s2len;
for (i = 1; i < commonlen; i++) {
-#ifdef CASE_INSENSITIVE
- if (tolower(s1[i]) != tolower(s2[i]))
- return i;
-#else
if (s1[i] != s2[i])
return i;
-#endif
}
return commonlen;
}
{
ac_match *match = (ac_match *)malloc(sizeof(ac_match));
- if(match) {
+ if (match) {
match->user_data = user_data;
match->position = pos;
match->size = size;
node->nsposns++;
node->sposns = (int *)malloc(node->nsposns * sizeof(int));
- if(node->sposns) {
+ if (node->sposns) {
node->sposns[0] = offs;
for (i = 1; i < node->nsposns; i++)
ancestors_decrement_sposns(node, nbad);
node->nsposns -= nbad;
- if (node->nsposns == 0) {
+ if (node->nsposns == 0)
node->sposns = NULL;
- } else {
+ else {
node->sposns = (int *)malloc(node->nsposns * sizeof(int));
for (i = 0; i < node->nsposns; i++) {
*/
static int get_totlen(ac_node *node)
{
- size_t len = strlen(node->branch);
- while (node->stem != NULL) {
- node = node->stem;
- len += strlen(node->branch);
- }
- return (int)len;
+ int len = node->brlen;
+ if (node->stem != NULL)
+ len += get_totlen(node->stem);
+ return len;
+}
+
+#ifdef CASE_INSENSITIVE
+/**
+ Return a newly allocated, NUL-terminated copy of the first len bytes of string
+ with every byte passed through tolower(). Returns NULL if the allocation fails.
+ Result should be freed after use.
+*/
+static char *string_tolower(const char *string, size_t len)
+{
+	size_t i;
+	char *tl_string = (char *)malloc(len + 1);
+	if (!tl_string) return NULL;
+	for (i = 0; i < len; i++) {
+		/* tolower() is only defined for unsigned char values (and EOF) */
+		tl_string[i] = (char)tolower((unsigned char)string[i]);
+	}
+	tl_string[len] = 0;
+	return tl_string;
}
+#endif
void add_node(ac_instance *ac, const char *string, void *user_data)
{
- char *ptr = (char *)string;
+ char *ptr, *tmpptr;
+ size_t stlen = strlen(string);
ac_node *node = ac->root, *xnode;
- int finished = 0;
+ boolean finished = FALSE, override_finished;
int res;
int i = 0;
+#ifdef CASE_INSENSITIVE
+ char *xstring = string_tolower(string, stlen);
+ if (xstring == NULL) return;
+ ptr = xstring;
+#else
+ ptr = (char *)string;
+#endif
+
while (!finished) {
+ override_finished = FALSE;
if (node != NULL && node->leaves != NULL) {
/* starting with the root node, we look at each leaf in turn */
for (i = 0; (xnode = node->leaves[i]) != NULL; i++) {
* - if common is l.t branch, common part becomes node, and branch and node become children
* c) xnode->branch > text, insert node before xnode */
- res = compare(xnode->branch, ptr);
+ res = compare(xnode->branch, xnode->brlen, ptr, stlen);
if (res == 0)
/* xnode before new node */
} else {
ac_node *child1;
/* res letters were in common */
- if (res == strlen(xnode->branch)) {
+ node = xnode; // consider only this child node
+ if (res == node->brlen) {
/* all letters in common, so now we check the leaves */
- node = xnode;
ptr += res;
- if (strlen(ptr) == 0) {
+ stlen = strlen(ptr);
+ if (stlen == 0) {
/* duplicate string */
node->user_data = user_data;
- node->is_match = TRUE;
+ node->terminal = TRUE;
+#ifdef CASE_INSENSITIVE
+ free(xstring);
+#endif
return;
}
/* leave for() loop and continue in while (!finished) */
- i = 0;
+ if (node->leaves != NULL) {
+ // if it also has child leaves, override setting of finished
+ override_finished = TRUE;
+ } else i = 0; // otherwise add us as the first child
break;
}
/* the common part was l.t. all
* the common part becomes the new node
* we add both this node with all its children and the new node as children */
- child1 = new_node(xnode->user_data, (const char *)(xnode->branch + res));
+ child1 = new_node_strlen(xnode->user_data, (const char *)(xnode->branch + res), xnode->brlen - res);
child1->leaves = xnode->leaves;
child1->stem = xnode;
+ child1->terminal = xnode->terminal;
if (child1->leaves != NULL) {
for (i = 0; (node = child1->leaves[i]) != NULL; i++) {
/* adjust stems for each of these */
}
}
/* remove xnodes leaves and shorten its text */
- memset(xnode->branch + res, 0, 1);
+ tmpptr = (char *)malloc(res);
+ memcpy(tmpptr, xnode->branch, res);
+ free(xnode->branch);
+ xnode->branch = tmpptr;
+ xnode->brlen = res;
- if (strlen(ptr) > res) {
+ if (stlen > res) {
ac_node *child2;
/* remainder of new string is added as child2 */
- xnode->is_match = FALSE;
+ xnode->terminal = FALSE;
xnode->leaves = (ac_node **)malloc(3 * sizeof(ac_node *));
if (xnode->leaves != NULL) {
- child2 = new_node(user_data, (const char *)(ptr + res));
- ac->added++;
- child2->stem = xnode;
-
- res = compare(child1->branch, child2->branch);
- if (res == 0) {
- /* child1 before child2 */
- xnode->leaves[0] = child1;
- xnode->leaves[1] = child2;
- } else {
- /* child1 before child2 */
- xnode->leaves[0] = child2;
- xnode->leaves[1] = child1;
- }
- xnode->leaves[2] = NULL;
+ child2 = new_node(user_data, (const char *)(ptr + res));
+ ac->added++;
+ child2->stem = xnode;
+
+ res = compare(child1->branch, child1->brlen, child2->branch, child2->brlen);
+ if (res == 0) {
+ /* child1 before child2 */
+ xnode->leaves[0] = child1;
+ xnode->leaves[1] = child2;
+ } else {
+ /* child2 before child1 */
+ xnode->leaves[0] = child2;
+ xnode->leaves[1] = child1;
+ }
+ xnode->leaves[2] = NULL;
}
} else {
/* the common part consumed all of new string, so we only have 1 child to add */
+ xnode->terminal = TRUE;
xnode->leaves = (ac_node **)malloc(2 * sizeof(ac_node *));
if (xnode->leaves != NULL) {
- xnode->user_data = user_data;
- xnode->leaves[0] = child1;
- xnode->leaves[1] = NULL;
+ xnode->user_data = user_data;
+ xnode->leaves[0] = child1;
+ xnode->leaves[1] = NULL;
}
}
}
+#ifdef CASE_INSENSITIVE
+ free(xstring);
+#endif
return;
}
}
- finished = 1;
+ if (!override_finished) finished = TRUE;
}
/* we checked all children and none were after ptr */
if (ac->root == NULL) {
/* new root - make a root node and add us as a leaf */
node = ac->root = new_node(NULL, "");
- ac->root->is_match = FALSE;
+ ac->root->terminal = FALSE;
}
add_child(node, i, xnode);
+#ifdef CASE_INSENSITIVE
+ free(xstring);
+#endif
}
/**
* @callgraph
*/
-ac_match *parse_char(char ch, int offs, ac_node *node)
+static ac_match *parse_char_with_case(unsigned char ch, int offs, ac_node *node)
{
ac_match *acm = NULL;
ac_node *xnode;
- size_t brlen;
int child_sposns = 0;
+ int sposn, totlen;
int i, j;
- if (node->checked)
- return NULL;
+ if (!node || node->checked) return NULL;
- brlen = strlen(node->branch);
- if (brlen > 0) {
-#ifdef CASE_INSENSITIVE
- if (node->stem->stem == NULL && tolower(node->branch[0]) == tolower(ch)) {
- /* leaf of root node, and first char matched, start a new sposn */
- add_sposn(node, offs);
+ if (node->leaves != NULL && (node->brlen == 0 || node->child_sposns != 0)) {
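+ /* starting from the root, we look at each child in turn and check (recursively) for matches
+ * (skip if there are no child_sposns, to avoid parsing the whole tree).
+ * NOTE(review): the loop below stops at the first already-checked child rather than skipping it,
+ * so any unchecked siblings after it are not visited on that pass - confirm this is intended */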
+ for (i = 0; (xnode = node->leaves[i]) != NULL && !xnode->checked; i++) {
+ if (node->brlen == 0 || xnode->nsposns > 0 || xnode->child_sposns > 0) {
+ acm = parse_char_with_case(ch, offs, xnode);
+ if (acm != NULL)
+ return acm;
+ }
}
-#else
- if (node->stem->stem == NULL && node->branch[0] == ch) {
+ }
+
+ /* after checking the child nodes, we look at the node itself */
+
+ if (node->brlen > 0) {
+ /* the cast is just to prevent SVACE warnings, as long as both types are the same it does not matter */
+ if (node->stem->stem == NULL && (unsigned char)node->branch[0] == ch) {
/* leaf of root node, and first char matched, start a new sposn */
add_sposn(node, offs);
}
-#endif
+
if (node->nsposns > 0) {
/* first check the ongoing matches in sposns */
for (i = 0; i < node->nsposns; i++) {
- if (node->sposns[i] > offs)
- continue;
-
-#ifdef CASE_INSENSITIVE
- if (tolower(node->branch[offs - node->sposns[i]]) != tolower(ch)) {
- /* mark for removal */
+ sposn = node->sposns[i];
+ /* the cast is just to prevent SVACE warnings, as long as both types are the same it does not matter */
+ if ((unsigned char)node->branch[offs - sposn] != ch) {
+ /* mismatch, mark for removal */
node->sposns[i] = -1;
- } else {
-#else
- if (node->branch[offs - node->sposns[i]] != ch) {
- /* mark for removal */
- node->sposns[i] = -1;
- } else {
-#endif
- /* continuing match */
- if (offs - node->sposns[i] == brlen - 1) {
- /* the entirety of this branch was matched */
- if (node->leaves == NULL || node->is_match) {
- /* we got a match ! */
- int totlen = get_totlen(node);
- acm = create_match(node->user_data, node->sposns[i] - totlen + brlen, totlen);
+ } else if (offs - sposn == node->brlen - 1) {
+ /* the entirety of this branch was matched */
+ if (node->terminal) {
+ /* we got a match ! */
+ totlen = get_totlen(node);
+ if (acm == NULL) {
+ // since the values of node->sposns[] are unique, acm should always be NULL
+ // the check is added to prevent SVACE warnings
+ acm = create_match(node->user_data, sposn - totlen + node->brlen, totlen);
}
- if (node->leaves != NULL) {
- /* otherwise place possible matches at next char for all children */
- for (j = 0; (xnode = node->leaves[j]) != NULL; j++) {
- add_sposn(xnode, offs + 1);
- child_sposns++;
- }
+ }
+ if (node->leaves != NULL) {
+ /* place possible matches at next char for all children */
+ for (j = 0; (xnode = node->leaves[j]) != NULL; j++) {
+ add_sposn(xnode, offs + 1);
+ child_sposns++;
}
- /* mark for removal */
- node->sposns[i] = -1;
+ /* increase child sposn count for node and all ancestors */
+ ancestors_increment_sposns(node, child_sposns);
}
+ /* mark for removal */
+ node->sposns[i] = -1;
}
}
- }
-
- /* after checking all sposns we clean up bad matches */
- clean_bad_sposns(node);
- }
-
- if (acm == NULL) {
- if (node->leaves != NULL && (brlen == 0 || node->child_sposns != 0)) {
- /* starting from the root, we look at each leaf in turn and check for matches
- * (skip if there are no child_sposns, to avoid parsing the whole tree) */
- for (i = 0; (xnode = node->leaves[i]) != NULL; i++) {
- acm = parse_char(ch, offs, xnode);
- if (acm != NULL)
- return acm;
- }
+ /* after checking all sposns we clean up bad matches */
+ clean_bad_sposns(node);
}
- }
- /* increase child sposn count for node and all ancestors */
- if (child_sposns != 0)
- ancestors_increment_sposns(node, child_sposns);
+ /* mark this node as checked for this position */
+ node->checked = TRUE;
+ }
- /* mark this node as checked for this position */
- node->checked = 1;
return acm;
}
+/**
+ * Forward ch, offs and node to parse_char_with_case() and return its result.
+ * When CASE_INSENSITIVE is defined, ch is lower-cased before being forwarded.
+ * @callgraph
+ */
+ac_match *parse_char(unsigned char ch, int offs, ac_node *node)
+{
+#ifdef CASE_INSENSITIVE
+	ch = (unsigned char)tolower(ch);
+#endif
+	return parse_char_with_case(ch, offs, node);
+}
+
void reset_checks(ac_node *node)
{
int i;
- node->checked = 0;
+ if (!node) return;
+ node->checked = FALSE;
if (node->leaves != NULL) {
ac_node *xnode;
free(node);
}
-void dump_tree(ac_node *node)
+void dump_tree(ac_node *node, FILE *stream)
{
int i;
ac_node *xnode;
- printf("%s (%p)", node->branch, node->user_data);
+	int otabbing, xtabbing;
+	/* current indent column; static, so it is shared by every level of the recursion */
+	static int tabbing;
+
+	if (!node) return;
+
+	/* a node without a branch, without a parent, or whose parent has an empty branch starts at column 0 */
+	if (node->branch == NULL || node->stem == NULL || node->stem->brlen == 0)
+		tabbing = 0;
+
+	for (i = 0; i < tabbing; i++)
+		fprintf(stream, " ");
+
+	/* remember the column we were called with so it can be restored on the way out */
+	otabbing = tabbing;
+
+	if (node->brlen > 0) {
+		/* bound the print with brlen rather than copying branch into a temporary buffer */
+		fprintf(stream, "%.*s (%p)", node->brlen, node->branch, node->user_data);
+		tabbing += node->brlen + 10;
+		if (node->terminal) fprintf(stream, " |");
+	}
if (node->leaves == NULL) {
- printf("\n");
+	fprintf(stream, "\n");
return;
}
- printf(" ");
+	fprintf(stream, "->\n");
+
+	xtabbing = tabbing;
+
+	for (i = 0; (xnode = node->leaves[i]) != NULL; i++) {
+		dump_tree(xnode, stream);
+		/* the recursive call may have moved the column; every sibling starts from the same one */
+		tabbing = xtabbing;
+	}
- for (i = 0; (xnode = node->leaves[i]) != NULL; i++)
- dump_tree(xnode);
+	tabbing = otabbing;
}
void clear_sposns(ac_node *node)
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <privacy_guard_client.h>
+#include <privacy_guard_client_internal.h>
+#include <time.h>
+
+#include "privacy_guard_dlp.h"
+#include "../ahocorasick/ahocorasick.h"
+#include "../ahocorasick/node.h"
+
+#define BRIGHTNESS 0
+#define RED 31
+#define GREEN 32
+#define YELLOW 33
+#define BG_BLACK 40
+
+static int fail_cnt = 0;
+static int success_cnt = 0;
+
+/* scenario settings */
+
+/** Test aho-corasick algorithm.
+ * Instructions: uncomment exactly ONE of the scenarios below. Set the number of target words (NWORDS).
+ * Adjust the other settings as required.
+ */
+
+/* only uncomment ONE of these at a time */
+#define SCENARIO_1 // pull words from text, parse same text
+//#define SCENARIO_2 // pull words from text, parse different text
+//#define SCENARIO_3 // pull words from text, parse random text
+//#define SCENARIO_4 // create random words, parse random text
+
+#define NWORDS 1000 // target number of words to search for
+
+#define WORD_SRCFILE "/tmp/smatch.txt"
+#define TEXT_SRCFILE "/tmp/smatch2.txt"
+
+#define SMATCH_BUFSIZE 65536 // text packet buffer size
+
+//#define SHOW_ADDED // uncomment to show strings as they are added
+#define DUMP_TREE // uncomment to dump the tree after adding strings
+//#define SHOW_MATCHES // uncomment to show details of each match
+
+///////////////////////////////////////////////////////////////////////////////
+// test utilities (aho-corasick)
+///////////////////////////////////////////////////////////////////////////////
+
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <fcntl.h>
+
+/*
+ * Small deterministic pseudo-random generator for the test scenarios.
+ * Returns a value in [0, max), or 0 when max is not positive.
+ * The state is unsigned so that the multiply/add wraps instead of overflowing
+ * a signed int; the sequence starts from the same state (0) on every run.
+ */
+static int fastrand(int max)
+{
+	static unsigned int next = 0;
+
+	if (max <= 0)
+		return 0; /* avoid a modulo by zero (or by a negative bound) */
+
+	next = next * 1103515245u + 12345u;
+	return (int)((next / 65536u) % (unsigned int)max);
+}
+
+static void __change_color_to_red(void)
+{
+	/* emit ESC [ BRIGHTNESS ; RED m on stdout */
+	const int esc = 0x1B;
+
+	fprintf(stdout, "%c[%d;%dm", esc, BRIGHTNESS, RED);
+}
+
+static void __change_color_to_green(void)
+{
+	/* emit ESC [ BRIGHTNESS ; GREEN m on stdout */
+	const int esc = 0x1B;
+
+	fprintf(stdout, "%c[%d;%dm", esc, BRIGHTNESS, GREEN);
+}
+
+static void __change_color_to_yellow(void)
+{
+	/* emit ESC [ BRIGHTNESS ; YELLOW m on stdout */
+	const int esc = 0x1B;
+
+	fprintf(stdout, "%c[%d;%dm", esc, BRIGHTNESS, YELLOW);
+}
+
+static void __change_color_to_origin(void)
+{
+	/* emit ESC [ 0 m on stdout */
+	const int esc = 0x1B;
+
+	fprintf(stdout, "%c[%dm", esc, 0);
+}
+
+static void __start_test(const char *function_name)
+{
+	/* horizontal rule printed above and below the test name */
+	static const char rule[] =
+		"================================================================================\n";
+
+	__change_color_to_yellow();
+	fputs(rule, stdout);
+	printf("\t%s\n", function_name);
+	fputs(rule, stdout);
+	__change_color_to_origin();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// test verification utility
+///////////////////////////////////////////////////////////////////////////////
+
+// Test String Match (ahocorasick)
+//
+// Adds up to NWORDS words to a matcher (picked from WORD_SRCFILE, or random
+// 7-letter words in SCENARIO_4), then scans a text buffer and prints the build
+// time, the parse time and the number of matches found. Which text is scanned
+// depends on the SCENARIO_* macro that is defined.
+static void __test_string_match(void)
+{
+	__start_test(__func__);
+	unsigned int i = 0;
+	unsigned int nadded = 0; // words actually handed to the matcher
+	unsigned long long stime;
+
+	printf("Search match: ");
+
+	int awl = 0; // sum of the added word lengths, turned into the average below
+	char buf[SMATCH_BUFSIZE];
+
+#ifndef SCENARIO_4
+	// load text from file
+	int fd = open(WORD_SRCFILE, O_RDONLY);
+	if (fd < 0) {
+		printf("Could not open " WORD_SRCFILE ". It should be a text file for me to find source words.\n");
+		return;
+	}
+	ssize_t tsize = read(fd, buf, SMATCH_BUFSIZE - 1);
+	close(fd); // closed on every path, including a failed read
+	if (tsize <= 0) {
+		printf("Could not read from " WORD_SRCFILE "\n");
+		return;
+	}
+	buf[tsize] = '\0';
+
+	// count words in text
+	int nwords = 1;
+
+	for (ssize_t k = 0; k < tsize; k++) {
+		if (buf[k] == ' ') nwords++;
+	}
+	if (nwords < 2) {
+		// with no space in the text, fastrand(nwords - 1) would be asked for a value modulo 0
+		printf(WORD_SRCFILE " must contain at least two space separated words\n");
+		return;
+	}
+
+	// bound the number of rejected picks so a text without long enough words cannot loop forever
+	unsigned int rejected = 0;
+	const unsigned int max_rejected = NWORDS * 100u;
+#endif
+
+	int res = 0, nres;
+	ac_instance *ac = ahocorasick_init();
+	if (ac == NULL) {
+		printf("Could not create the matcher instance\n");
+		return;
+	}
+	struct timeval tv;
+
+	gettimeofday(&tv, NULL);
+	stime = (unsigned long long)tv.tv_sec * 1000000ULL + (unsigned long long)tv.tv_usec;
+
+	for (i = 1; i <= NWORDS; i++) {
+		char *word;
+#ifdef SCENARIO_4
+		int j;
+		word = malloc(8);
+		if (word == NULL) {
+			printf("Out of memory while creating words\n");
+			break;
+		}
+		for (j = 0; j < 7; j++) word[j] = (char)('a' + fastrand(20));
+		word[7] = 0;
+#else
+		// get a random number between 1 and nwords - 1
+		int start = -1, j;
+		int idxword = fastrand(nwords - 1) + 1;
+
+		// walk to the idxword'th space; start is the first char of the word that ends there
+		for (j = 0; idxword > 0 && j < tsize; j++) {
+			if (idxword == 1 && start == -1) start = j;
+			if (buf[j] == ' ') idxword--;
+		}
+		if (idxword > 0 || j - start < 6) {
+			// word not found or too short: pick again, within the retry budget
+			if (++rejected > max_rejected) {
+				printf("Gave up looking for long enough words in " WORD_SRCFILE "\n");
+				break;
+			}
+			i--;
+			continue;
+		}
+
+		word = strndup(buf + start, j - start - 1);
+		if (word == NULL) {
+			printf("Out of memory while copying a word\n");
+			break;
+		}
+#endif
+		awl += (int)strlen(word);
+		// the 1-based word index is passed through as the opaque user data
+		nres = ahocorasick_add_string(ac, word, (void *)(uintptr_t)i);
+#ifdef SHOW_ADDED
+		printf("Added: .%s. %zu\n", word, strlen(word));
+#endif
+		res = nres;
+		nadded++;
+		free(word);
+	}
+
+	// average over the words that were really added (i is one past the last index here)
+	awl = (nadded > 0) ? awl / (int)nadded : 0;
+
+	gettimeofday(&tv, NULL);
+	stime = (unsigned long long)tv.tv_sec * 1000000ULL + (unsigned long long)tv.tv_usec - stime;
+	printf("Build tree took %.2f msec for %d words (%d unique), average word len was %d\n", stime / 1000., (int)nadded, res, awl);
+	stime = (unsigned long long)tv.tv_sec * 1000000ULL + (unsigned long long)tv.tv_usec;
+
+#ifdef SCENARIO_2
+	// load alternate text
+	fd = open(TEXT_SRCFILE, O_RDONLY);
+	if (fd < 0) {
+		printf("Could not open " TEXT_SRCFILE ". It should be a related text file for me to search for matches.\n");
+		ahocorasick_free(ac);
+		return;
+	}
+	tsize = read(fd, buf, SMATCH_BUFSIZE - 1);
+	close(fd);
+	if (tsize <= 0) {
+		printf("Could not read from " TEXT_SRCFILE "\n");
+		ahocorasick_free(ac);
+		return;
+	}
+	buf[tsize] = '\0';
+#endif
+
+#if defined SCENARIO_3 || defined SCENARIO_4
+	for (i = 0; i < 32768; i++)
+		buf[i] = (char)fastrand(100) + 30;
+	buf[32768] = 0;
+
+#endif
+	const char *text = buf;
+	ac_match *match;
+
+#ifdef DUMP_TREE
+	dump_tree(ac->root, stdout);
+#endif
+
+	ahocorasick_set_text(ac, text, strlen(text), 0);
+
+	res = 0;
+	do {
+		match = ahocorasick_find_next(ac);
+
+		if (match != NULL) {
+#ifdef SHOW_MATCHES
+			for (i = match->position; i < match->position + match->size; i++) printf("%c", text[i]);
+			printf("\n");
+			printf("got match for %p at %d len %d\n", match->user_data, match->position, match->size);
+#endif
+			res++;
+		}
+	} while (match != NULL);
+
+	gettimeofday(&tv, NULL);
+	stime = (unsigned long long)tv.tv_sec * 1000000ULL + (unsigned long long)tv.tv_usec - stime;
+	printf("Parsing took %.2f msec\n", stime / 1000.);
+
+	printf("Got a total of %d matches.\n", res);
+
+	ahocorasick_free(ac);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Main
+//////////////////////////////////////////////////////////////////////////
+int main()
+{
+	/* opening banner, in green */
+	__change_color_to_green();
+	fputs("DLP Test Start\n", stdout);
+	__change_color_to_origin();
+
+	/* string match (ahocorasick) test */
+	__test_string_match();
+
+	/* summary: success count in green, failure count in red */
+	__change_color_to_green();
+	fputs("Test Complete\n", stdout);
+	printf("success : %d, ", success_cnt);
+
+	__change_color_to_red();
+	printf("fail : %d\n", fail_cnt);
+
+	__change_color_to_origin();
+	return 0;
+}