/* word-cnt.c: count all words in a text file ** Written by Michael J. Gourlay ** Fall 1996, University of Colorado at Boulder ** Algorithms */ /* Top level description: ---------------------- This program reads a file, and counts the number of words occuring in the file. This is done by building a "dictionary" of the file, which is a list of all of the words occuring in the file. The "dictionary" is represented as a tree with an unlimited branching factor, where each node in the dicitonary has an associated letter and a linked list of children nodes whose letters are the subsequent letters in potential words. Also associated with each node is a number of "hits" which is the number of times a word made of the letters of the ancestors of that dictionary node, plus the letter associated with that dictionary node, is encountered in the file. Because the tree being built is such that order of construction matters, the tree can not be a sort tree (and hence not self balancing). It has to be a kind of radix tree with an indefinite branching factor. As the file is read, the dicitonary is created, and the number of "hits" is updated. After the file is read, the dictionary is "dumped", i.e. the contents are printed out along with the number of hits for each word. Nodes with a "hits" count greater than 0 are considered to be end-of-word nodes and only tallied words are considered complete words. */ #include #include #include #include /* for isalnum */ /* ----------------------- */ /* Utility functions */ /* ----------------------- */ #define MIN(x,y) ((x)<(y)?(x):(y)) /* my_calloc: memory allocation with error checking ** Arguments are the same as for standard library function "calloc" ** Exit program if memory allocation fails. */ void * my_calloc(size_t nelem, size_t elsize) { char *mem; if((mem=(char *)calloc(nelem, elsize))==NULL) { fprintf(stderr, "my_calloc: out of memory. Tried %i x %i\n", nelem, elsize); abort(); /* allow for hard core debugging -- not nice for production*/ exit(1); /* more smooth for distribution version */ } return mem; } /* MY_CALLOC: macro to do memory allocation easily ** ne (in): number of elements. Must be integer. ** type (in): type token (not a value). Must be a valid variable type. */ #define MY_CALLOC(ne,type) ((type *)my_calloc(ne,sizeof(type))) /* ======================== */ /* Singly linked list */ /* ======================== */ /* To grader: ** I used my linked list routines for this project, which I wrote ** years ago. Normally, of course, the code for these routines would ** not be inside this file. */ typedef struct LinkList_ { void *data; struct LinkList_ *link; } LinkListT; /* linklist.c -- routines and data structures to handle singly linked lists ** ** Written and Copyright (C) 1993-1996 by Michael J. Gourlay ** NO WARRENTIES, EXPRESS OR IMPLIED */ /* VERBOSE : verbose reporting, for testing */ /* LINK_LIST_TEST : create full test suite */ /* DRASTIC : drastic tests: could dump core */ /* ------------------------------------------------------------------------ ** linkListCreate -- allocate memory for a new node ** ** data (in): pointer to data associated with new node */ LinkListT * linkListCreate(void *data) { LinkListT *new; if((new=(LinkListT *)malloc(sizeof(LinkListT)))==NULL) { fprintf(stderr, "linkListCreate: could not allocate link\n"); return NULL; } else { new->data = data; new->link = NULL; return new; } } /* ------------------------------------------------------------------------ ** linkListInsert -- inserts the newnode before the node ** ** Note that the list argument is a pointer to pointer ** ** Returns -1 if node was not found in list. In which case, newnode ** is not added to the list. */ int linkListInsert(LinkListT **list, const LinkListT *node, LinkListT *newnode) { LinkListT *now, *prev; if(list==NULL) { #ifdef VERBOSE printf("linkListInsert: tried to insert to NULL list\n"); #endif /* VERBOSE */ return -2; } #ifdef VERBOSE printf("linkListInsert: before %p *=%p %p %p\n", list, *list, node, newnode); linkListPrint(*list, NULL); #endif /* VERBOSE */ /* Find "node" within the list */ for(prev=NULL,now= *list; (now!=node)&&(now!=NULL); prev=now,now=now->link) ; /* If "node" was found, insert newnode before node */ if((now != NULL) || ((*list == node) && (node == NULL))) { /* add tail of old list after new node */ newnode->link = now; if(prev!=NULL) { /* Don't try to insert before beginning of list */ /* point prev's link to new node */ prev->link = newnode; } else { /* If prev == NULL then we tried to insert before the beginning, ** in which case we want to point the list itself to the newnode. */ *list = newnode; } #ifdef VERBOSE printf("linkListInsert: after\n"); linkListPrint(*list, NULL); #endif /* VERBOSE */ return 0; } else { /* Else "node" was not found so return error indicator */ #ifdef VERBOSE printf("linkListInsert: did not find node %p\n", node); #endif /* VERBOSE */ return -1; } } /* ---------------- */ /* Dictionary */ /* ---------------- */ /* A dictionary is essentially a tree with an unlimited branching ** factor, where each node in the tree has an associated 'letter', and ** a list of 'children' nodes whose associated letters are the ** subsequent letters in a word. The 'hits' member is the number of ** words encountered which end in the letter associated with that ** node, such that the word is composed of the letters of the ** ancestors of that node, where the letters of the word are in the ** descending order of the tree, i.e., starting at the head node and ** following children. */ typedef struct { char letter; /* letter of this node */ long hits; /* number of hits of the word ending in this letter */ LinkListT *children; /* linked list of following letters, ** where each node in the linked list has ** link 'data' member pointing to a DictionaryT. ** i.e. this is a linked list of dictionary nodes. */ } DictionaryT; /* dictionaryCreate: create a dictionary node and associate with letter ** ** letter (in): letter to associate with new node ** ** New node has no children. ** New node 'hits' is set to zero. ** ** Returns address of the new dictionary node. */ DictionaryT * dictionaryCreate(const char letter) { DictionaryT *newnode; /* Allocate a new DictionaryT node */ newnode = MY_CALLOC(1, DictionaryT); /* Set the dictionary node letter member to argument letter */ newnode->letter = letter; /* Initialize the number of hits of the word ending in this letter */ newnode->hits = 0; /* Initialize the list of children of this dictionary node */ newnode->children = NULL; return newnode; } /* dictionarySelectLetter: select child of current node with given letter ** ** If there is no child associated with the argument letter, then ** create such a node and add it to the list of children of the ** argument dictionary node, dictP, then return that new node. ** ** (A possible speed-up heuristic would be to always move the selected ** child to the head of the list of children. This would, on average, ** make frequently selected children found faster in searching the list ** of children.) ** ** dictP (in/out): parent dictionary node ** letter (in): letter to select on ** ** Returns: selected child dictionary node. */ DictionaryT * dictionarySelectLetter(DictionaryT *dictP, const char letter) { DictionaryT *child; /* selection node, used to traverse the children */ { LinkListT *now; /* linked list traversal node */ /* Scan the children of this dictionary node for the argument letter */ for(now=dictP->children; (now!=NULL); now = now->link) { /* Type cast link data into a dictionary node */ child = (DictionaryT*) now->data; /* Check for sought letter */ if(child->letter == letter) { return child; } } } /* ---- If we get this far, letter was not found among children. ---- */ { LinkListT *child_link; /* linked list new child node */ /* Create a new dictionary node with the argument letter */ child = dictionaryCreate(letter); /* Prepend new node to list of children of argument dictionary node */ child_link = linkListCreate(child); linkListInsert(/* into this list */ &dictP->children, /* before this node */ dictP->children, /* insert this node */ child_link); } /* Return the new dictionary node */ return child; } /* dictionaryTally: read a file and tally words ** ** dictP (in/out): dictionary ** fp (in/out): file pointer of input text file ** ** Read a file, one character at a time, and build a "dictionary" ** which is a tree with an unlimited branching factor, which represents ** the words present in the file. */ void dictionaryTally(DictionaryT *dictP, FILE *fp) { DictionaryT *current; /* current dictionary node */ int letter; /* input letter */ #ifdef VERBOSE printf("dT: %p %p\n", dictP, fp); #endif /* VERBOSE */ /* Read and process each character of the input file stream */ while(!ferror(fp)) { /* Skip non-alpha-numeric input characters */ while(!isalnum(letter=fgetc(fp))) { #ifdef VERBOSE putchar(letter); #endif /* VERBOSE */ /* If we reach end of file during white space, exit this routine */ if(letter == EOF) { return; } } /* Beginning of a new word: Point to the top node of the dictionary. */ current = dictP; /* Process and read characters until next non-alpha-numeric */ for(; isalnum(letter); letter=fgetc(fp)) { #ifdef VERBOSE putchar(letter); #endif /* VERBOSE */ /* If end-of-file reached.. */ if(letter == EOF) { /* If the current dictionary node isn't the top node.. */ if(current != dictP) { /* Increment number of hits of the word ending in this letter */ current->hits ++; } else { /* Will this ever happen? ** -- Not possible, but I want to find out in case I'm wrong. */ printf("EOF in alpha-num at top node\n"); } /* Exit this routine */ return; } /* Select the child associated with the input letter. */ current = dictionarySelectLetter(current, letter); } /* -- If we get this far, we're at the end of a word. ** Increment number of hits of the word ending in this letter. */ current->hits ++; /* Point to top of dictionary to start over at a new word. */ current = dictP; } } /* dictionaryDump: dump (print) the contents of the dictionary ** I.e. prints how many times each word occured in the text which ** created the dictionary. ** ** Recursive. ** ** dictP (in): pointer to dictionary node ** word (in): word up to and including this node ** ** Each execution of dictionaryDump has as input the current node of ** the dictionary, dictP, and a word consisting of all of the letters ** of the ancestors of the current dictionary node, dictP, and the ** letter associated with the current dictionary node. */ void dictionaryDump(const DictionaryT *dictP, const char *word) { LinkListT *now; const DictionaryT *child; char *new_word; #ifdef VERBOSE printf("dD: %p %p\n", dictP, word); #endif /* VERBOSE */ /* If 'hits' is positive, Print word and hits associated with this node. */ if(dictP->hits > 0) { printf("%s occured %i times\n", word, dictP->hits); } /* For each child, call dictionaryDump. */ for(now=dictP->children; (now!=NULL); now = now->link) { /* Type cast link data into a dictionary node */ child = (DictionaryT*) now->data; if(word == NULL) { /* We're at top node so this is the first allocation */ new_word = malloc(2); sprintf(new_word, "%c", child->letter); } else { /* Lengthen the word memory */ new_word = malloc(strlen(word)+1); /* Create the new word by including child's letter */ sprintf(new_word, "%s%c", word, child->letter); } /* Recurse */ dictionaryDump(child, new_word); /* Free memory for new_word */ free(new_word); } } int main(int argc, char **argv) { DictionaryT *dictionary; /* top node of the dictionary */ char * filename; /* input filename */ FILE *fp; /* input file stream pointer */ /* Parse command line arguments: ** ** There are two valid ways to use this program: ** (1) word-cnt filename ** Opens file named 'filename' and reads from that file ** (2) word-cnt < filename or cat filename | word-cnt ** Reads from standard input. */ if(argc < 2) { /* If no filename was given, read from standard input. */ fp = stdin; } else if (argc == 2) { /* Use command line argument as filename. */ filename = argv[1]; if((fp=fopen(filename, "r"))==NULL) { fprintf(stderr, "%s: could not open file '%s'\n", argv[0], filename); exit(1); } } else { /* Too many command line arguments were given. */ fprintf(stderr, "usage: %s [filename]", argv[0]); exit(2); } /* Initialize top node of dictionary: ** The top node has no associated letter. It serves only as a ** branching-off node to its children nodes. Initially, the top ** node has no children. */ dictionary = dictionaryCreate('\0'); /* Read file and tally words by building a dictionary tree: */ dictionaryTally(dictionary, fp); /* Close the input file */ if(fp!=stdin) { fclose(fp); } /* Print out the tallied words: */ dictionaryDump(dictionary, NULL); /* pathological test cases: empty file file full of non-alpha-numeric file with only alpha-numeric file with only one letter words file with word at end file with single letter at end file with non-alpha-num at end file with 2 non-alpha-num at end */ /* Make Unix happy */ return 0; }