/*
* This file is part of echelon
* Copyright (c) 2003 Gianni Tedesco
* Released under the terms of the GNU GPL version 2
*/

#include <echelon.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <errno.h>
#include <string_hash.h>

/*
 * One distinct word seen in the input, linked into a wordhash[] bucket.
 * The NUL-terminated text lives inline after the fixed fields, so an
 * entry is allocated as sizeof(struct word) + strlen(text) + 1 bytes.
 */
struct word {
	u_int32_t hash;		/* full string_hash() value of str */
	u_int32_t count;	/* occurrences recorded so far */
	struct word *next;	/* next entry in the same bucket, or NULL */
	char str[];		/* word text (C99 flexible array member) */
};

/* Per-byte classification flags stored in tokchars[] */
#define TOK_DELIM	(1<<0) /* Tokenize on this. Never appear in tokens */
#define TOK_STRIP_END	(1<<1) /* Never begin or end a token */
#define TOK_STRIP	(1<<2) /* Never appear in tokens */

/*
 * Classification table indexed by byte value. Bytes that are not listed
 * carry no flags. Nothing in the table sets TOK_STRIP at present.
 * The table is only ever read, hence const.
 */
static const u_int8_t tokchars[0x100] = {
	[ 0x00 ] = TOK_DELIM,
	[ 0x01 ] = TOK_DELIM,
	[ 0x02 ] = TOK_DELIM,
	[ 0x03 ] = TOK_DELIM,
	[ 0x04 ] = TOK_DELIM,
	[ 0x05 ] = TOK_DELIM,
	[ 0x06 ] = TOK_DELIM,
	[ 0x07 ] = TOK_DELIM,
	[ 0x08 ] = TOK_DELIM,
	[ 0x09 ] = TOK_DELIM,
	[ 0x0a ] = TOK_DELIM,
	[ 0x0b ] = TOK_DELIM,
	[ 0x0c ] = TOK_DELIM,
	[ 0x0d ] = TOK_DELIM,
	[ 0x0e ] = TOK_DELIM,
	[ 0x0f ] = TOK_DELIM,
	[ 0x10 ] = TOK_DELIM,
	[ 0x11 ] = TOK_DELIM,
	[ 0x12 ] = TOK_DELIM,
	[ 0x13 ] = TOK_DELIM,
	[ 0x14 ] = TOK_DELIM,
	[ 0x15 ] = TOK_DELIM,
	[ 0x16 ] = TOK_DELIM,
	[ 0x17 ] = TOK_DELIM,
	[ 0x18 ] = TOK_DELIM,
	[ 0x19 ] = TOK_DELIM,
	[ 0x1a ] = TOK_DELIM,
	[ 0x1b ] = TOK_DELIM,
	[ 0x1c ] = TOK_DELIM,
	[ 0x1d ] = TOK_DELIM,
	[ 0x1e ] = TOK_DELIM,
	[ 0x1f ] = TOK_DELIM,
	[ ',' ] = TOK_DELIM,
	[ ';' ] = TOK_DELIM,
	[ ' ' ] = TOK_DELIM,
	[ '(' ] = TOK_DELIM,
	[ ')' ] = TOK_DELIM,
	[ '{' ] = TOK_DELIM,
	[ '}' ] = TOK_DELIM,
	[ '[' ] = TOK_DELIM,
	[ ']' ] = TOK_DELIM,
	[ '?' ] = TOK_STRIP_END,
	[ '!' ] = TOK_STRIP_END,
	[ ':' ] = TOK_STRIP_END,
	[ '.' ] = TOK_STRIP_END,
	[ '\'' ] = TOK_STRIP_END,
	[ '\"' ] = TOK_STRIP_END,
};

/*
 * Non-zero when chr separates tokens: either it is flagged TOK_DELIM in
 * tokchars[] or its top bit (0x80) is set.
 */
static inline int lex_space(u_int8_t chr)
{
	const int delim_bit = tokchars[chr] & TOK_DELIM;
	const int high_bit = chr & 0x80;

	return delim_bit | high_bit;
}

/*
 * Non-zero when chr is flagged TOK_STRIP_END in tokchars[], i.e. it is
 * trimmed from the start and end of a token.
 */
static inline int lex_strip_end(u_int8_t chr)
{
	const u_int8_t flags = tokchars[chr];

	return flags & TOK_STRIP_END;
}

/*
 * Non-zero when chr is flagged TOK_STRIP in tokchars[], i.e. it is
 * dropped from wherever it occurs inside a token.
 */
static inline int lex_strip(u_int8_t chr)
{
	const u_int8_t flags = tokchars[chr];

	return flags & TOK_STRIP;
}

/* Number of buckets in wordhash[]. lex_word() selects a bucket with
 * hash & (WHASH_SZ-1), which relies on this being a power of two.
 */
#define WHASH_SZ 1024
/* Chained hash table holding every distinct word recorded so far */
static struct word *wordhash[WHASH_SZ];
/* Total number of entries across all wordhash[] chains */
static unsigned int num_words;

/*
 * qsort() comparator for an array of struct word pointers: higher counts
 * sort first, and equal counts fall back to strcmp() order of the text.
 */
static int count_cmp(const void *A, const void *B)
{
	struct word *lhs = *(struct word **)A;
	struct word *rhs = *(struct word **)B;

	if ( lhs->count != rhs->count )
		return (lhs->count > rhs->count) ? -1 : 1;

	return strcmp(lhs->str, rhs->str);
}

static void lex_summary(void)
{
	struct word *arr[num_words];
	struct word *w;
	int i, j=0;

	for(i=0; i < WHASH_SZ; i++) {
		for(w=wordhash[i]; w; w=w->next) {
			arr[j++] = w;
		}
	}

	qsort(arr, num_words, sizeof(*arr), count_cmp);

	msg(MSG_INFO, "There are %u words", num_words);
	
	for(i=0; i < num_words; i++) {
		msg(MSG_DEBUG, "word: %u: '%s'",
				arr[i]->count, arr[i]->str);
	}
}

/*
 * Record one occurrence of the NUL-terminated word buf.
 *
 * If the word is already in wordhash[] its count is incremented.
 * Otherwise a new entry holding a private copy of the text is allocated
 * and pushed onto the head of its bucket chain, and num_words is bumped.
 * On allocation failure an error is logged and the word is not recorded.
 */
static void lex_word(const char *buf)
{
	u_int32_t hash = string_hash(buf);
	u_int32_t bkt = hash & (WHASH_SZ-1);
	struct word *w;
	size_t len;

	/* compare the cached hash first so strcmp() only runs on likely hits */
	for(w=wordhash[bkt]; w; w=w->next) {
		if ( w->hash == hash && !strcmp(w->str, buf) ) {
			w->count++;
			return;
		}
	}

	/* size_t, not int: strlen() must not be narrowed before sizing */
	len = strlen(buf) + 1;
	w = malloc(sizeof(*w) + len);
	if ( w == NULL ) {
		msg(MSG_ERR, "%s: malloc(): %s", buf, sys_err());
		return;
	}

	w->hash = hash;
	w->count = 1;
	memcpy(w->str, buf, len);
	w->next = wordhash[bkt];
	wordhash[bkt] = w;
	num_words++;
}

/*
 * Normalise one raw token of len bytes and pass it to lex_word().
 *
 * Bytes flagged by lex_strip_end() are trimmed from both ends, bytes
 * flagged by lex_strip() are dropped, and the rest is lower-cased.
 * Tokens that end up empty or contain no alphabetic byte are discarded.
 */
static void lex_token(const u_int8_t *tok, size_t len)
{
	char buf[len+1];
	size_t first = 0, used = 0;
	size_t pos;
	int seen_alpha = 0;

	/* trim the tail, then the head */
	while ( len > 0 && lex_strip_end(tok[len-1]) )
		len--;
	/* tok[len-1] is now a keeper, so this stops before reaching len */
	while ( first < len && lex_strip_end(tok[first]) )
		first++;

	for(pos=first; pos < len; pos++) {
		const u_int8_t c = tok[pos];

		if ( lex_strip(c) )
			continue;

		buf[used++] = tolower(c);
		if ( isalpha(c) )
			seen_alpha = 1;
	}

	if ( used == 0 || !seen_alpha )
		return;

	buf[used] = '\0';
	lex_word(buf);
}

/*
 * Split the len bytes at map into tokens and pass each to lex_token().
 *
 * A token is a maximal run of bytes for which lex_space() is false.
 * tok doubles as the scanner state: NULL while between tokens, otherwise
 * the start of the token being scanned.
 *
 * Always returns 1.
 */
static int lex_mem(const u_int8_t *map, size_t len)
{
	const u_int8_t *tok = NULL;
	const u_int8_t *tmp;
	const u_int8_t *end = map + len;

	for(tmp=map; tmp < end; tmp++) {
		if ( tok == NULL ) {
			if ( !lex_space(*tmp) )
				tok = tmp;
		}else if ( lex_space(*tmp) ) {
			lex_token(tok, tmp - tok);
			tok = NULL;
		}
	}

	/* Input that ends without a delimiter leaves a token open.
	 * Emit it, otherwise the final word would be lost.
	 */
	if ( tok != NULL )
		lex_token(tok, end - tok);

	return 1;
}

/*
 * Map the file fn read-only and run lex_mem() over its contents.
 *
 * Returns the result of lex_mem() on success, 1 for an empty regular
 * file (nothing to lex), and 0 if the file could not be opened,
 * stat'ed or mapped (an error is logged in those cases).
 */
static int lex_file(const char *fn)
{
	struct stat st;
	size_t map_sz;
	int fd;
	int ret = 0;
	u_int8_t *map;

	fd = open(fn, O_RDONLY);
	if ( fd < 0 ) {
		msg(MSG_ERR, "%s: open(): %s", fn, sys_err());
		return ret;
	}

	if ( fstat(fd, &st) ) {
		msg(MSG_ERR, "%s: fstat(): %s", fn, sys_err());
		goto out_close;
	}

	/* mmap() rejects a zero length with EINVAL. An empty regular
	 * file is not an error, there is simply nothing to lex.
	 */
	if ( S_ISREG(st.st_mode) && st.st_size == 0 ) {
		ret = 1;
		goto out_close;
	}

	/* Never map more than 1/2 of the virtual address space:
	 * anything more is bound to fail, and we may as well
	 * attempt to support large files...
	 */
	if ( st.st_size >= SSIZE_MAX )
		st.st_size = SSIZE_MAX;
	map_sz = (size_t)st.st_size;

	map = mmap(NULL, map_sz, PROT_READ, MAP_SHARED, fd, 0);
	if ( map == MAP_FAILED ) {
		msg(MSG_ERR, "%s: mmap(): %s", fn, sys_err());
		goto out_close;
	}

	msg(MSG_DEBUG, "lexing: %s", fn);
	ret = lex_mem(map, map_sz);

	munmap(map, map_sz);
out_close:
	/* Do not retry close() on EINTR: the descriptor state is then
	 * unspecified (already released on Linux), so a retry could close
	 * an unrelated descriptor. Nothing was written, so a failure here
	 * loses no data.
	 */
	(void)close(fd);
	return ret;
}

/*
 * Lex every file named on the command line, then log the word summary.
 * Per-file failures are reported by lex_file() and do not affect the
 * exit status, which is always 0.
 */
int main(int argc, char **argv)
{
	int arg = 1;

	while ( arg < argc ) {
		lex_file(argv[arg]);
		arg++;
	}

	lex_summary();

	return 0;
}