/* text.c: text searching algorithm ** Written by Michael J. Gourlay ** Fall 1996, University of Colorado at Boulder ** Algorithms */ /* Top level description: ---------------------- A text pattern searching algorithm is implemented using the Knuth-Morris-Pratt algorithm, which is a variation of a finite-state automaton. One advantage of using a finite state automaton is that the each character in the input text is read only once, so the algorithm is fast and needs only as much storage as the associated with the search patterns. A "transition function" is used to move the finite state machine from one state to another depending on (a) the current state and (b) the current input. The KMP variation is to skip the precomputation of the transition function and instead compute the transition function on the fly using a "prefix function", which is a function which contains information about the self-overlapping nature of the search pattern. My variation of the KMP algorith allows for an unlimited number of simultanous search patterns to be searched for in the text, while retaining the property that the text is scanned only once. I do this by keeping a list of search patterns, and searching for each pattern independantly but simultaneously in the KMP algorithm. Accompanying the pattern search routines is a text program which reads in a list of patterns from standard input, then reads a text file and searches for each of the patterns in the file. The program behaves a little like "grep" in that occurences of the search pattern in the text are reported with information indicating exactly where in the text file the search pattern was found. */ #include #include #include /* ----------------------- */ /* Utility functions */ /* ----------------------- */ #define MIN(x,y) ((x)<(y)?(x):(y)) /* my_calloc: memory allocation with error checking ** Arguments are the same as for standard library function "calloc" ** Exit program if memory allocation fails. */ void * my_calloc(size_t nelem, size_t elsize) { char *mem; if((mem=(char *)calloc(nelem, elsize))==NULL) { fprintf(stderr, "my_calloc: out of memory. Tried %i x %i\n", nelem, elsize); abort(); exit(1); } return mem; } /* MY_CALLOC: macro to do memory allocation easily ** ne (in): number of elements. Must be integer. ** type (in): type token (not a value). Must be a valid variable type. */ #define MY_CALLOC(ne,type) ((type *)my_calloc(ne,sizeof(type))) /* -------------------- */ /* Pattern object */ /* -------------------- */ /* PatternT: structure for a search pattern */ typedef struct { char *string; /* pattern string */ long len; /* length of pattern (also length of prefix function) */ long *prefix; /* prefix function of Knuth-Morris-Pratt */ } PatternT; /* patternAlloc: set len and allocate space for the PatternT members ** patP (in/out): pointer to PatternT ** len (in): length of pattern string and prefix function */ void patternAlloc(PatternT *patP, const long len) { patP->len = len; patP->string = MY_CALLOC(len+1, char); patP->string[len] = '\0'; patP->prefix = MY_CALLOC(len, long); } /* patternFree: free memory allocated to pattern members ** Does NOT free space for pattern structure -- only the alloc'd members */ void patternFree(PatternT *patP) { free(patP->string); patP->string = NULL; free(patP->prefix); patP->prefix = NULL; patP->len = 0; } /* patternComputePrefixFunction: Compute Knuth-Morris-Pratt prefix ** patP(in/out): pattern. patP->len and patP->string must already be set. ** patP->prefix member is set in this routine. */ void patternComputePrefixFunction(PatternT *patP) { long ki; long qi; patP->prefix[0] = 0; ki = 0; for(qi=1; qi < patP->len; qi++) { while((ki > 0) && (patP->string[ki] != patP->string[qi])) { ki = patP->prefix[ki-1]; } if(patP->string[ki] == patP->string[qi]) { ki ++; } patP->prefix[qi] = ki; } } /* patternInit: allocate and initialize a pattern. ** patP (out): pattern whose members are set. ** patP->string is allocated and set to string. ** patP->len is set to len. ** patP->prefix is computed. ** string (in): pattern string ** len (in): length of pattern string. ** len is provided because string may contain NULL characters. */ void patternInit(PatternT *patP, const char *string, const long len) { /* Allocate space for the pattern members */ patternAlloc(patP, len); /* Copy the argument string into the pattern string */ memcpy(patP->string, string, len); /* Compute the Knuth-Morris-Pratt prefix function for the pattern */ patternComputePrefixFunction(patP); } /* patternRead: read a pattern string from a file ** Sets patP->string to the pattern string. ** Sets patP->len to the length of the pattern string. ** Computes and sets patP->prefix. ** ** Pattern string can be any length that will fit into memory. ** ** patP (out): pattern ** fp (in/out): file pointer */ void patternRead(PatternT *patP, FILE *fp) { int inchar; char *buf; /* input buffer */ long len; /* number of characters read in so far */ long size=128; /* current size of the input buffer */ buf = MY_CALLOC(size, char); for(len=0; ; len++) { if(len >= size) { /* Allocate some more space for the pattern buffer */ size += 128; buf = realloc(buf, size * sizeof(char)); } /* Read in the next character of the pattern */ inchar = fgetc(fp); /* If we read an end-of-line character, that's the end of the pattern */ if((inchar == '\n') || (inchar == EOF)) { buf[len] = '\0'; break; } buf[len] = inchar; } /* Done reading pattern string */ /* Copy the buffer into the pattern string. */ patternInit(patP, buf, len); /* Free the buffer */ free(buf); } /* patternPrint: print a pattern (for diagnostic purposes) */ void patternPrint(const PatternT *patP) { int pi; /* Print the characters of the pattern string. ** This is done explicitly, character by character, ** because in general the pattern my contain NULL characters. */ for(pi=0; pi < patP->len; pi++) { printf("%c", patP->string[pi]); } #ifdef VERBOSE printf("\n"); /* Print the pattern prefix function */ for(pi=0; pi < patP->len; pi++) { printf(" %i", patP->prefix[pi]); } printf("\n"); #endif /* VERBOSE */ } /* patternReadFile: read a list of search patterns from an input stream ** patPPP (in/out): pointer to an array of pointers to patterns. ** The array of pointers and the patterns are allocated here. ** fp (in): file pointer. ** ** File structure: ** line 1 : integer. Number of pattern strings to follow. ** lines 2+: pattern strings, one per line. */ int patternReadFile(PatternT ***patPPP, FILE *fp) { int num_pat; int pi; /* Read from file the number of patterns to be read */ fscanf(fp, "%i\n", &num_pat); /* Allocate an array of patterns */ *patPPP = MY_CALLOC(num_pat, PatternT*); for(pi=0; pi < num_pat; pi++) { /* Allocate a PatternT */ (*patPPP)[pi] = MY_CALLOC(1, PatternT); } /* Read each pattern from standard input */ for(pi=0; pi < num_pat; pi++) { /* Read in the pattern string and compute the prefix function */ patternRead((*patPPP)[pi], fp); } return num_pat; } /* textKmpMatcher: Knuth-Morris-Pratt text pattern matcher for a file. ** ** fp (in): input file pointer. Must already be open for reading. ** ** pats (in): array of pointers to patterns ** The search patterns are assumed to have their search string, ** prefix function, and length members set before calling this ** routine. ** ** The patterns are not altered in this routine. ** ** num_pat (in): number of patterns in pats ** ** This is an implementation of the Knuth-Morris-Pratt text matching ** algorithm, implemented to search a file for multiple search ** patterns. The file is read, one character at a time, and is used as ** the text string. Not more than a single character of the text file ** is stored at a time. The file is read only once. All of the patterns ** are searched for simultaneously. */ void textKmpMatcher(FILE *fp, PatternT *pats[], const int num_pat) { long ti; /* text character index (starts at 1) */ int tc; /* input text character */ long lines; /* line counter, just for more useful information */ long chars; /* char-of-line counter, just for more useful info */ int pi; /* pattern index: 0 <= pi < num_pat */ long *qi; /* array of pattern string indices, one per pattern */ /* Allocate and initialize the pattern string indices */ qi = MY_CALLOC(num_pat, long); /* Read the text file, one character at a time */ for(chars=lines=ti=1; !feof(fp); chars++, ti++) { tc = fgetc(fp); /* Take a character from the input stream */ if(tc == EOF) { /* See whether this is end of file */ break; } else if(tc == '\n') { lines++; chars=1; } /* For each pattern, search the text a'la Knuth-Morris-Pratt */ for(pi=0; pi < num_pat; pi++) { /* ---- Compute transition function (delta) */ /* The while loop uses the statement that either ** delta(qi, tc) == 0 ** or delta(qi, tc) - 1 (is an element of) prefix^*[qi] */ while((qi[pi] > 0) && (pats[pi]->string[qi[pi]] != tc)) { /* Compute qi = prefix^*, the iterated prefix function */ qi[pi] = pats[pi]->prefix[qi[pi]-1]; } if(pats[pi]->string[qi[pi]] == tc) { qi[pi] ++; } /* ---- End of computation of transition function (delta) */ if(qi[pi] == pats[pi]->len) { printf("pattern '"); patternPrint(pats[pi]); printf("' occurs with shift %5li (at line %3li,char %2li)\n", ti-pats[pi]->len, lines, chars-pats[pi]->len); qi[pi] = pats[pi]->prefix[qi[pi]-1]; } } /* for pi */ } /* for ti */ } /* Read a list of patterns, ** open a text file, ** and search it for the patterns. */ int main(int argc, char **argv) { char *text_filename; int num_pat; PatternT **pats; int pi; FILE *text_fp; /* Parse command line arguments */ if(argc != 2) { printf("usage: %s \n", argv[0]); exit(1); } else { text_filename = argv[1]; } /* Read the list of patterns from standard input */ num_pat = patternReadFile(&pats, stdin); #ifdef VERBOSE /* Print the patterns */ for(pi=0; pi < num_pat; pi++) { printf("pat[%i] = ", pi); patternPrint(pats[pi]); } #endif /* VERBOSE */ /* Open the text file */ if((text_fp=fopen(text_filename, "r"))==NULL){ printf("%s: could not open text file '%s'\n", argv[0], text_filename); exit(2); } /* Read and search the text file */ textKmpMatcher(text_fp, pats, num_pat); /* Free the patterns */ for(pi=0; pi < num_pat; pi++) { patternFree(pats[pi]); } /* Free the array of pattern pointers */ free(pats); /* Make unix happy */ return 0; }