/* * This file is part of echelon * Copyright (c) 2003 Gianni Tedesco * Released under the terms of the GNU GPL version 2 */ #include #include #include #include #include #include #include #include #include #include #include #include struct word { u_int32_t hash; u_int32_t count; struct word *next; char str[0]; }; #define TOK_DELIM (1<<0) /* Tokenize on this. Never appear in tokens */ #define TOK_STRIP_END (1<<1) /* Never begin or end a token */ #define TOK_STRIP (1<<2) /* Never appear in tokens */ static u_int8_t tokchars[0x100] = { [ 0x00 ] TOK_DELIM, [ 0x01 ] TOK_DELIM, [ 0x02 ] TOK_DELIM, [ 0x03 ] TOK_DELIM, [ 0x04 ] TOK_DELIM, [ 0x05 ] TOK_DELIM, [ 0x06 ] TOK_DELIM, [ 0x07 ] TOK_DELIM, [ 0x08 ] TOK_DELIM, [ 0x09 ] TOK_DELIM, [ 0x0a ] TOK_DELIM, [ 0x0b ] TOK_DELIM, [ 0x0c ] TOK_DELIM, [ 0x0d ] TOK_DELIM, [ 0x0e ] TOK_DELIM, [ 0x0f ] TOK_DELIM, [ 0x10 ] TOK_DELIM, [ 0x11 ] TOK_DELIM, [ 0x12 ] TOK_DELIM, [ 0x13 ] TOK_DELIM, [ 0x14 ] TOK_DELIM, [ 0x15 ] TOK_DELIM, [ 0x16 ] TOK_DELIM, [ 0x17 ] TOK_DELIM, [ 0x18 ] TOK_DELIM, [ 0x19 ] TOK_DELIM, [ 0x1a ] TOK_DELIM, [ 0x1b ] TOK_DELIM, [ 0x1c ] TOK_DELIM, [ 0x1d ] TOK_DELIM, [ 0x1e ] TOK_DELIM, [ 0x1f ] TOK_DELIM, [ ',' ] TOK_DELIM, [ ';' ] TOK_DELIM, [ ' ' ] TOK_DELIM, [ '(' ] TOK_DELIM, [ ')' ] TOK_DELIM, [ '{' ] TOK_DELIM, [ '}' ] TOK_DELIM, [ '[' ] TOK_DELIM, [ ']' ] TOK_DELIM, [ '?' ] TOK_STRIP_END, [ '!' ] TOK_STRIP_END, [ ':' ] TOK_STRIP_END, [ '.' ] TOK_STRIP_END, [ '\'' ] TOK_STRIP_END, [ '\"' ] TOK_STRIP_END, }; static inline int lex_space(u_int8_t chr) { return (tokchars[chr] & TOK_DELIM) | (chr & 0x80); } static inline int lex_strip_end(u_int8_t chr) { return (tokchars[chr] & TOK_STRIP_END); } static inline int lex_strip(u_int8_t chr) { return (tokchars[chr] & TOK_STRIP); } #define WHASH_SZ 1024 static struct word *wordhash[WHASH_SZ]; static unsigned int num_words; static int count_cmp(const void *A, const void *B) { struct word *a = *(struct word **)A; struct word *b = *(struct word **)B; if ( a->count > b->count ) return -1; if ( a->count < b->count ) return 1; return strcmp(a->str, b->str); } static void lex_summary(void) { struct word *arr[num_words]; struct word *w; int i, j=0; for(i=0; i < WHASH_SZ; i++) { for(w=wordhash[i]; w; w=w->next) { arr[j++] = w; } } qsort(arr, num_words, sizeof(*arr), count_cmp); msg(MSG_INFO, "There are %u words", num_words); for(i=0; i < num_words; i++) { msg(MSG_DEBUG, "word: %u: '%s'", arr[i]->count, arr[i]->str); } } static void lex_word(const char *buf) { u_int32_t hash = string_hash(buf); u_int32_t bkt = hash & (WHASH_SZ-1); struct word *w; int len; for(w=wordhash[bkt]; w; w=w->next) { if ( w->hash == hash && !strcmp(w->str, buf) ) { w->count++; return; } } len = strlen(buf) + 1; w = malloc(sizeof(*w) + len); if ( w == NULL ) { msg(MSG_ERR, "%s: malloc(): %s", buf, sys_err()); return; } w->hash = hash; w->count = 1; memcpy(w->str, buf, len); w->next = wordhash[bkt]; wordhash[bkt] = w; num_words++; } static void lex_token(const u_int8_t *tok, size_t len) { char buf[len+1]; size_t in=0, out=0; size_t alpha; while ( len && lex_strip_end(tok[len-1]) ) len--; while ( len && lex_strip_end(tok[in]) ) in++; for(alpha=0; in < len; in++) { if ( !lex_strip(tok[in]) ) { buf[out++] = tolower(tok[in]); if ( isalpha(tok[in]) ) alpha++; } } if ( out == 0 || alpha == 0 ) return; buf[out] = '\0'; lex_word(buf); } static int lex_mem(const u_int8_t *map, size_t len) { const u_int8_t *tok = NULL; const u_int8_t *tmp = map; const u_int8_t *end = map + len; int state; for(state=0; tmp < end; tmp++) { switch (state) { case 0: if ( !lex_space(*tmp) ) { state = 1; tok = tmp; } break; case 1: if ( lex_space(*tmp) ) { lex_token(tok, tmp - tok); state = 0; tok = NULL; } break; } } return 1; } static int lex_file(const char *fn) { struct stat st; size_t map_sz; int fd; int ret = 0; u_int8_t *map; fd = open(fn, O_RDONLY); if ( fd < 0 ) { msg(MSG_ERR, "%s: open(): %s", fn, sys_err()); goto err; } if ( fstat(fd, &st) ) { msg(MSG_ERR, "%s: fstat(): %s", fn, sys_err()); goto err_close; } /* never map more than 1/2 virtual address space, * anything more is bound to fail, and we may aswell * attempt to support large files... */ if ( st.st_size >= SSIZE_MAX ) st.st_size = SSIZE_MAX; map_sz = (size_t)st.st_size; map = mmap(NULL, map_sz, PROT_READ, MAP_SHARED, fd, 0); if ( map == MAP_FAILED ) { msg(MSG_ERR, "%s: mmap(): %s", fn, sys_err()); goto err_close; } msg(MSG_DEBUG, "lexing: %s", fn); ret = lex_mem(map, map_sz); munmap(map, map_sz); err_close: if ( close(fd) && errno == EINTR ) goto err_close; err: return ret; } int main(int argc, char **argv) { int i; for(i=1; i < argc; i++) lex_file(argv[i]); lex_summary(); return 0; }