/*--------------------------------------------------------------------- * All portions of code are copyright by their respective author/s. * Copyright (C) 1996-2001 Vuthichai Ampornaramveth * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *---------------------------------------------------------------------*/ /* Thai word-separator by dictionary */ /* By Vuthichai A. */ #define DICTFILE "tdict.txt" #define MAXWORD 50000 #define MAXWORDLENGTH 30 #define MAXLINELENGTH 400 #define MAXSTATE 100000 #include #include #include int levtable[]={ 0,2,0,0,2,2,2,2,1,1,1,0,0,0,0,0, 0,0,0,0,0,0,0,2,3,3,3,3,3,2,3,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0 }; int map[MAXSTATE][96]; int state[MAXSTATE]; int maxstate, rnumword; int mincol, maxcol; void readfile(char *); void fixline(unsigned char *); void dooneline(unsigned char *,unsigned char *); int findword(unsigned char *); void add2map(unsigned char *,int); void initmap(); void prmap(const char* cfile, const char* hfile); unsigned char *wordptr[MAXWORD]; int numword; /* Number of words in memory */ /* Usage : dict2state */ int main(int argc, char* argv[]) { FILE *fopen(); int i, count, nn; numword = nn = 0; /* Sort while reading */ readfile(argc >= 2 ? argv[1] : DICTFILE); initmap(); for(i=0;i= 3 ? argv[2] : 0, argc >= 4 ? argv[3] : 0); return 0; } void readfile(char *fname) { FILE *fp, *fopen(); unsigned char str[MAXWORDLENGTH]; unsigned char *ostr; int l, i; fp = fopen(fname,"r"); while(!feof(fp)) { fgets((char*)str,MAXWORDLENGTH-1,fp); if(!feof(fp)) { fixline(str); wordptr[numword] = (unsigned char *)malloc((l=strlen((char*)str))+2); if(wordptr[numword]==NULL) printf("Memory Error\n"); strcpy((char*)wordptr[numword]+1, (char*)str); wordptr[numword][l]=0; /* Remove new line */ wordptr[numword][0] = l-1; if(numword > 0) if(strcmp((char*)wordptr[numword]+1,(char*)wordptr[numword-1]+1)<0) { ostr = wordptr[numword]; i = numword; while(i && (strcmp((char*)ostr+1,(char*)wordptr[i-1]+1)<0)) { wordptr[i] = wordptr[i-1]; i--; } wordptr[i]=ostr; } numword++; if(numword % 2000 ==0) printf("%d\n",numword); } } fclose(fp); printf("Reading dictionary done.\n"); } void fixline(line) unsigned char *line; { unsigned char top,up,middle,low; unsigned char out[MAXLINELENGTH]; int i,j,c; i=j=0; strcpy((char*)out,(char*)line); top=up=middle=low=0; while( (c=out[i++]) ) { switch((c>0xD0)?levtable[c-0xD0]:0) { case 0 : if(middle) { line[j++]=middle; if(low) line[j++]=low; if(up) line[j++]=up; if(top) line[j++]=top; } top=up=middle=low=0; middle=c; break; case 1 : low=c; break; case 2 : up=c; break; case 3 : top=c; break; } } if(middle) { line[j++]=middle; if(low) line[j++]=low; if(up) line[j++]=up; if(top) line[j++]=top; } line[j]=0; } void initmap() { int i,j; for(i=0;imaxcol) maxcol=cc; if(cc0) { curstate = map[curstate][c]; /* printf("%c to %d\n", cc, curstate); */ } else { map[curstate][c] = maxstate; curstate=maxstate; if(i==len-1) { state[maxstate] = 1; /* printf("%s at state %d\n", str, maxstate); */ } maxstate++; if(maxstate >= MAXSTATE) { fprintf(stderr,"Not Enough No. of States\n"); exit(1); } } i++; } } void prmap(const char* cfile, const char* hfile) { FILE *FP; int i,j,c; int state_min[MAXSTATE]; int state_max[MAXSTATE]; int state_offset[MAXSTATE]; int offset,min,max; c=maxcol-mincol+1; offset=0; printf("Writing Map File...\n"); FP=fopen(cfile ? cfile : "map.c","w"); fprintf(FP,"unsigned short map[] = {\n"); for(i=0;i