Commit 9aa3dbbf authored by 's avatar
Browse files

Added functions for Aho-Corasick Multi-Pattern Search Engine and

implemented the option for using this algorithm trackers that use 
multiple strings.

errors.mapi: added an error for no memory.


git-svn-id: file:///home/svn/mapi/trunk@750 8d5bb341-7cf1-0310-8cf6-ba355fef3186
parent 335d4ff2
## Process this file with automake to produce Makefile.in
AM_CPPFLAGS = -I.. -I../lib -I../drivers -D_GNU_SOURCE -D_THREAD_SAFE
noinst_LTLIBRARIES = libflist.la libfhelp.la libmapiipc.la libparseconf.la libprintfstring.la libmapilibhandler.la libmsearch.la
noinst_LTLIBRARIES = libflist.la libfhelp.la libmapiipc.la libparseconf.la libprintfstring.la libmapilibhandler.la libmsearch.la libacsmx2.la
libflist_la_SOURCES = flist.c flist.h
libflist_la_DEPENDENCIES = mapi_errors.h
libfhelp_la_SOURCES = fhelp.c fhelp.h
......@@ -10,6 +10,7 @@ libparseconf_la_SOURCES = parseconf.c parseconf.h
libprintfstring_la_SOURCES = printfstring.c printfstring.h
libmapilibhandler_la_SOURCES = mapilibhandler.c mapilibhandler.h
libmsearch_la_SOURCES = mstring.c mstring.h
libacsmx2_la_SOURCES = acsmx2.c acsmx2.h
noinst_HEADERS = mapi_errors.h
......
/*
** $Id$
**
** acsmx2.c
**
** Multi-Pattern Search Engine
**
** Aho-Corasick State Machine - version 2.0
**
** Supports both Non-Deterministic and Deterministic Finite Automata
**
**
** Reference - Efficient String matching: An Aid to Bibliographic Search
** Alfred V Aho and Margaret J Corasick
** Bell Labratories
** Copyright(C) 1975 Association for Computing Machinery,Inc
**
** +++
** +++ Version 1.0 notes - Marc Norton:
** +++
**
** Original implementation based on the 4 algorithms in the paper by Aho & Corasick,
** some implementation ideas from 'Practical Algorithms in C', and some
** of my own.
**
** 1) Finds all occurrences of all patterns within a text.
**
** +++
** +++ Version 2.0 Notes - Marc Norton/Dan Roelker:
** +++
**
** New implementation modifies the state table storage and access model to use
** compacted sparse vector storage. Dan Roelker and I hammered this strategy out
** amongst many others in order to reduce memory usage and improve caching performance.
** The memory usage is greatly reduced, we only use 1/4 of what we use to. The caching
** performance is better in pure benchmarking tests, but does not show overall improvement
** in Snort. Unfortunately, once a pattern match test has been performed Snort moves on to doing
** many other things before we get back to a patteren match test, so the cache is voided.
**
** This versions has better caching performance characteristics, reduced memory,
** more state table storage options, and requires no a priori case conversions.
** It does maintain the same public interface. (Snort only used banded storage).
**
** 1) Supports NFA and DFA state machines, and basic keyword state machines
** 2) Initial transition table uses Linked Lists
** 3) Improved state table memory options. NFA and DFA state
** transition tables are converted to one of 4 formats during compilation.
** a) Full matrix
** b) Sparse matrix
** c) Banded matrix (Default-this is the only one used in snort)
** d) Sparse-Banded matrix
** 4) Added support for acstate_t in .h file so we can compile states as
** 16, or 32 bit state values for another reduction in memory consumption,
** smaller states allows more of the state table to be cached, and improves
** performance on x86-P4. Your mileage may vary, especially on risc systems.
** 5) Added a bool to each state transition list to indicate if there is a matching
** pattern in the state. This prevents us from accessing another data array
** and can improve caching/performance.
** 6) The search functions are very sensitive, don't change them without extensive testing,
** or you'll just spoil the caching and prefetching opportunities.
**
** Extras for fellow pattern matchers:
** The table below explains the storage format used at each step.
** You can use an NFA or DFA to match with, the NFA is slower but tiny - set the structure directly.
** You can use any of the 4 storage modes above -full,sparse,banded,sparse-bands, set the structure directly.
** For applications where you have lots of data and a pattern set to search, this version was up to 3x faster
** than the previous verion, due to caching performance. This cannot be fully realized in Snort yet,
** but other applications may have better caching opportunities.
** Snort only needs to use the banded or full storage.
**
** Transition table format at each processing stage.
** -------------------------------------------------
** Patterns -> Keyword State Table (List)
** Keyword State Table -> NFA (List)
** NFA -> DFA (List)
** DFA (List)-> Sparse Rows O(m-avg # transitions per state)
** -> Banded Rows O(1)
** -> Sparse-Banded Rows O(nb-# bands)
** -> Full Matrix O(1)
**
** Copyright(C) 2002,2003,2004 Marc Norton
** Copyright(C) 2003,2004 Daniel Roelker
** Copyright(C) 2002,2003,2004 Sourcefire,Inc.
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "acsmx2.h"
/*
*
*/
#define MEMASSERT(p,s) if(!p){printf("ACSM-No Memory: %s!\n",s);exit(0);}
/*
*
*/
static int max_memory = 0;
/*
*
*/
static int s_verbose=0;
/*
*
*/
typedef struct acsm_summary_s
{
unsigned num_states;
unsigned num_transitions;
ACSM_STRUCT2 acsm;
}acsm_summary_t;
/*
*
*/
static acsm_summary_t summary={0,0};
/*
** Case Translation Table
*/
static unsigned char xlatcase[256];
/*
*
*/
static
void
init_xlatcase()
{
int i;
for (i = 0; i < 256; i++)
{
xlatcase[i] = toupper(i);
}
}
/*
* Case Conversion
*/
static
inline
void
ConvertCaseEx (unsigned char *d, unsigned char *s, int m)
{
int i;
#ifdef XXXX
int n;
n = m & 3;
m >>= 2;
for (i = 0; i < m; i++ )
{
d[0] = xlatcase[ s[0] ];
d[2] = xlatcase[ s[2] ];
d[1] = xlatcase[ s[1] ];
d[3] = xlatcase[ s[3] ];
d+=4;
s+=4;
}
for (i=0; i < n; i++)
{
d[i] = xlatcase[ s[i] ];
}
#else
for (i=0; i < m; i++)
{
d[i] = xlatcase[ s[i] ];
}
#endif
}
/*
*
*/
void acsmSetVerbose2(int n)
{
s_verbose = 1;
}
/*
*
*/
static void *
AC_MALLOC (int n)
{
void *p;
p = malloc (n);
if (p)
max_memory += n;
return p;
}
/*
*
*/
static void
AC_FREE (void *p)
{
if (p)
free (p);
}
/*
* Simple QUEUE NODE
*/
typedef struct _qnode
{
int state;
struct _qnode *next;
}
QNODE;
/*
* Simple QUEUE Structure
*/
typedef struct _queue
{
QNODE * head, *tail;
int count;
}
QUEUE;
/*
* Initialize the queue
*/
static void
queue_init (QUEUE * s)
{
s->head = s->tail = 0;
s->count= 0;
}
/*
* Find a State in the queue
*/
static int
queue_find (QUEUE * s, int state)
{
QNODE * q;
q = s->head;
while( q )
{
if( q->state == state ) return 1;
q = q->next;
}
return 0;
}
/*
* Add Tail Item to queue (FiFo/LiLo)
*/
static void
queue_add (QUEUE * s, int state)
{
QNODE * q;
if( queue_find( s, state ) ) return;
if (!s->head)
{
q = s->tail = s->head = (QNODE *) AC_MALLOC (sizeof (QNODE));
MEMASSERT (q, "queue_add");
q->state = state;
q->next = 0;
}
else
{
q = (QNODE *) AC_MALLOC (sizeof (QNODE));
q->state = state;
q->next = 0;
s->tail->next = q;
s->tail = q;
}
s->count++;
}
/*
* Remove Head Item from queue
*/
static int
queue_remove (QUEUE * s)
{
int state = 0;
QNODE * q;
if (s->head)
{
q = s->head;
state = q->state;
s->head = s->head->next;
s->count--;
if( !s->head )
{
s->tail = 0;
s->count = 0;
}
AC_FREE (q);
}
return state;
}
/*
* Return items in the queue
*/
static int
queue_count (QUEUE * s)
{
return s->count;
}
/*
* Free the queue
*/
static void
queue_free (QUEUE * s)
{
while (queue_count (s))
{
queue_remove (s);
}
}
/*
* Get Next State-NFA
*/
static
int List_GetNextState( ACSM_STRUCT2 * acsm, int state, int input )
{
trans_node_t * t = acsm->acsmTransTable[state];
while( t )
{
if( t->key == input )
{
return t->next_state;
}
t=t->next;
}
if( state == 0 ) return 0;
return ACSM_FAIL_STATE2; /* Fail state ??? */
}
/*
* Get Next State-DFA
*/
static
int List_GetNextState2( ACSM_STRUCT2 * acsm, int state, int input )
{
trans_node_t * t = acsm->acsmTransTable[state];
while( t )
{
if( t->key == input )
{
return t->next_state;
}
t = t->next;
}
return 0; /* default state */
}
/*
* Put Next State - Head insertion, and transition updates
*/
static
int List_PutNextState( ACSM_STRUCT2 * acsm, int state, int input, int next_state )
{
trans_node_t * p;
trans_node_t * tnew;
// printf(" List_PutNextState: state=%d, input='%c', next_state=%d\n",state,input,next_state);
/* Check if the transition already exists, if so just update the next_state */
p = acsm->acsmTransTable[state];
while( p )
{
if( p->key == input ) /* transition already exists- reset the next state */
{
p->next_state = next_state;
return 0;
}
p=p->next;
}
/* Definitely not an existing transition - add it */
tnew = (trans_node_t*)AC_MALLOC(sizeof(trans_node_t));
if( !tnew ) return -1;
tnew->key = input;
tnew->next_state = next_state;
tnew->next = 0;
tnew->next = acsm->acsmTransTable[state];
acsm->acsmTransTable[state] = tnew;
acsm->acsmNumTrans++;
return 0;
}
/*
* Free the entire transition table
*/
static
int List_FreeTransTable( ACSM_STRUCT2 * acsm )
{
int i;
trans_node_t * t, *p;
if( !acsm->acsmTransTable ) return 0;
for(i=0;i< acsm->acsmMaxStates;i++)
{
t = acsm->acsmTransTable[i];
while( t )
{
p = t->next;
free(t);
t = p;
max_memory -= sizeof(trans_node_t);
}
}
free(acsm->acsmTransTable);
max_memory -= sizeof(void*) * acsm->acsmMaxStates;
acsm->acsmTransTable = 0;
return 0;
}
/*
*
*/
/*
static
int List_FreeList( trans_node_t * t )
{
int tcnt=0;
trans_node_t *p;
while( t )
{
p = t->next;
free(t);
t = p;
max_memory -= sizeof(trans_node_t);
tcnt++;
}
return tcnt;
}
*/
/*
* Print the trans table to stdout
*/
static
int List_PrintTransTable( ACSM_STRUCT2 * acsm )
{
int i;
trans_node_t * t;
ACSM_PATTERN2 * patrn;
if( !acsm->acsmTransTable ) return 0;
printf("Print Transition Table- %d active states\n",acsm->acsmNumStates);
for(i=0;i< acsm->acsmNumStates;i++)
{
t = acsm->acsmTransTable[i];
printf("state %3d: ",i);
while( t )
{
if( isprint(t->key) )
printf("%3c->%-5d\t",t->key,t->next_state);
else
printf("%3d->%-5d\t",t->key,t->next_state);
t = t->next;
}
patrn =acsm->acsmMatchList[i];
while( patrn )
{
printf("%.*s ",patrn->n,patrn->patrn);
patrn = patrn->next;
}
printf("\n");
}
return 0;
}
/*
* Converts row of states from list to a full vector format
*/
static
int List_ConvToFull(ACSM_STRUCT2 * acsm, acstate_t state, acstate_t * full )
{
int tcnt = 0;
trans_node_t * t = acsm->acsmTransTable[ state ];
memset(full,0,sizeof(acstate_t)*acsm->acsmAlphabetSize);
if( !t ) return 0;
while(t)
{
full[ t->key ] = t->next_state;
tcnt++;
t = t->next;
}
return tcnt;
}
/*
* Copy a Match List Entry - don't dup the pattern data
*/
static ACSM_PATTERN2*
CopyMatchListEntry (ACSM_PATTERN2 * px)
{
ACSM_PATTERN2 * p;
p = (ACSM_PATTERN2 *) AC_MALLOC (sizeof (ACSM_PATTERN2));
MEMASSERT (p, "CopyMatchListEntry");
memcpy (p, px, sizeof (ACSM_PATTERN2));
p->next = 0;
return p;
}
/*
* Check if a pattern is in the list already,
* validate it using the 'id' field. This must be unique
* for every pattern.
*/
/*
static
int FindMatchListEntry (ACSM_STRUCT2 * acsm, int state, ACSM_PATTERN2 * px)
{
ACSM_PATTERN2 * p;
p = acsm->acsmMatchList[state];
while( p )
{
if( p->id == px->id ) return 1;
p = p->next;
}
return 0;
}
*/
/*
* Add a pattern to the list of patterns terminated at this state.
* Insert at front of list.
*/
static void
AddMatchListEntry (ACSM_STRUCT2 * acsm, int state, ACSM_PATTERN2 * px)
{
ACSM_PATTERN2 * p;
p = (ACSM_PATTERN2 *) AC_MALLOC (sizeof (ACSM_PATTERN2));
MEMASSERT (p, "AddMatchListEntry");
memcpy (p, px, sizeof (ACSM_PATTERN2));
p->next = acsm->acsmMatchList[state];
acsm->acsmMatchList[state] = p;
}
static void
AddPatternStates (ACSM_STRUCT2 * acsm, ACSM_PATTERN2 * p)
{
int state, next, n;
unsigned char *pattern;
n = p->n;
pattern = p->patrn;
state = 0;