Logo Search packages:      
Sourcecode: hanzim version File versions  Download package

hinit.c

/* hinit.c: functions to read in databases and initialize all character
   information arrays. */

/* This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.  Please see the file "COPYING"
   for details.  If you have not received it along with this program,
   please write to the Free Software Foundation, Inc., 59 Temple Place,
   Suite 330, Boston, MA 02111-1307, USA. */

#include "hanzim.h"

#include <sys/types.h>
#include <sys/stat.h>

/* max length of a "line" (a sequence terminated by a newline character in a file) */
#define LINELEN 160

/* init_vars allocates and clears out arrays, returning 0/1 on fail/succeed */
char
init_vars(char *datadir)
{
  int       i,j;
  characterst     *cht;
  FILE            *fp;
  char            line[LINELEN];

  /* get info on database sizes */
  fp = Dopen(datadir,"zidianfl.b5");
  for (Nchar=0;getnextline(fp,line, LINELEN);Nchar++);
  fclose(fp);
  fp = Dopen(datadir,"cidianf.gb");
  for (Ncomp=0;getnextline(fp,line, LINELEN);Ncomp++);
  fclose(fp);
  fp = Dopen(datadir,"sanzicidianf.gb");
  for (Ntriplet=0;getnextline(fp,line, LINELEN);Ntriplet++);
  fclose(fp);
  
  Nchar += NRAD;  /* in case need extra to represent radicals */

  /* allocate */
  pinyintbl = malloc(sizeof(pinyinst) * NPINYIN);
  radtbl = malloc(sizeof(radicalst) * NRAD);
  chartbl = malloc(sizeof(characterst) * Nchar);
  comptbl = malloc (sizeof(compoundst) * Ncomp);
  triplettbl = malloc (sizeof(tripletst) * Ntriplet);
  fnt2chr = malloc(sizeof(int) * 1<<16);  /* any character we could look up */

  if (!pinyintbl || !radtbl || !chartbl || !comptbl || !triplettbl || !fnt2chr)
    return(0);    /* allocation failure */

  /* initialize */

  for (i=0;i<NPINYIN;i++) {
    pinyintbl[i].init = pinyintbl[i].finl = pinyintbl[i].tone = -1;
    pinyintbl[i].chars.n_entries = 0;
    }

  for (i=0;i<NRAD;i++) {
    radtbl[i].charidx = radtbl[i].pinyinidx = -1;
    radtbl[i].meaning[0] = '\0';
    radtbl[i].hassimp = radtbl[i].hasalt = radtbl[i].hasfull = 0;
    for (j=0;j<4;j++) radtbl[i].chars[j].n_entries = 0;
    }

  for (i=0;i<Nchar;i++) {
    cht = &(chartbl[i]);
    cht->fontidx = cht->big5idx = cht->radidx = cht->radtype = cht->radpos =
      cht->remainder = -1;
    cht->freq = 0;
    for (j=0;j<10;j++) cht->compidx[j] = -1;
    cht->meaning[0] = cht->utf_jh[0] = cht->utf_ft[0] = '\0';
    cht->Rpart.n_entries = cht->Lpart.n_entries = cht->pinyins.n_entries = 
      cht->remOf.n_entries = 0;
    }

  for (i=0;i<Ncomp;i++) {
    comptbl[i].Lcharidx = comptbl[i].Rcharidx = -1;
    comptbl[i].freq = 0;
    comptbl[i].meaning[0] = '\0';
    }

  for (i=0;i<Ntriplet;i++) {
    triplettbl[i].char1idx=triplettbl[i].char2idx=triplettbl[i].char3idx = -1;
    triplettbl[i].freq = 0;
    triplettbl[i].meaning[0] = '\0';
    }

  for (i=0;i<(1<<16);i++)
    fnt2chr[i] = -1;

  return(1);  /* success */
}


/* kanzi() reads in information from zidian database: character, frequency
   percentile, pinyin, and English gloss, and stores it in pinyin and
   character arrays */
void
kanzi(char *datadir)
{
  char            line[LINELEN], lineb5[20],
            *tpos;
  FILE            *fp, *fpb5;
  pinyinst  *pyt;       /* shorthands */
  characterst     *cht;
  uchar           fidxh, fidxl;     /* font characters */
  char            pystr[10],  /* pinyin */
            defn[DEFNL];
  int       fntidx, pyidx, cidx;    /* indices */
  int       freq;       /* frequency percentile */
  int       c = 0;
  char            compron,    /* flag: most frequent pronunciation? */
            secf;       /* flag: secondary defn/pron entry? */
  char            datafile[200];    /* data directory + filename for error msg */

  fp = Dopen(datadir,"zidianf.gb");
  fpb5 = Dopen(datadir,"zidianfl.b5");
  sprintf(datafile,"%s/%s",datadir,"zidianf.gb");

  /* here we read in a list of characters in pinyin order, initializing
     character indices in this order */
  while (getnextline(fp,line, LINELEN) && getnextline(fpb5,lineb5, 20)) {

    /*get first 3 entries, skip, and scan rest into definition */
    sscanf(line,"%c%c\t%d\t%s\t",&fidxh,&fidxl,&freq,pystr);
    tpos = skip(2,line);
    /* scan ourselves as \0 counts as nonspace */
    while (!isspace(*tpos) && *tpos != '\0') tpos++;
    if (*tpos == '\0') *defn = '\0';
    else {
      while (isspace(*tpos)) tpos++;
      strncpy(defn,tpos,DEFNL-1);
      defn[DEFNL-1] = '\0';   /* make sure null-terminated */
      }
    /* check if pinyin string signals secondary entry */
    if (pystr[strlen(pystr)-1] == '*') {
      secf = 1;
      pystr[strlen(pystr)-1] = '\0';
      }
    else
      secf = 0;

    if (strlen(defn)) { /* only bother with characters that have definitions */
      compron = 0;            /* will need this later */

      /* character stuff */
      fntidx = f_char2int(fidxh,fidxl);      /* compute font index */
      if (fnt2chr[fntidx] < 0) {    /* have a new char; initialize */
      cht = &(chartbl[c]);
      cht->fontidx = fntidx;
      strcat(cht->meaning,defn);
      cht->freq = freq;
      if (!(cht->pinyins.ent = malloc(sizeof(int) * MAXCPRON)))
          memfail("char pinyin");
      if (!(cht->remOf.ent = malloc(sizeof(int) * MAXREMOF)))
          memfail("char remainder");
      fnt2chr[fntidx] = c;
      /* scan the big5 character we read */
      sscanf(lineb5,"%c%c\t%s",&fidxh,&fidxl,cht->utf_bpmf);
      cht->big5idx = f_char2int(fidxh,fidxl);
      c++;
      }
      else {                        /* multi meaning/pron */
      cht = &(chartbl[fnt2chr[fntidx]]);
      /* set to meaning and freq of more common pronunciation */
      if (!secf) {  /* this should be primary entry: overwrite old def/pro */
        cht->freq = freq;
        strcpy(cht->meaning,defn);
        compron = 1;    /* mark for later switching of pinyin */
          sscanf(lineb5,"%*c%*c\t%s",cht->utf_bpmf);  /* overwrite bpmf */
        }
      }
      cidx = fnt2chr[fntidx];

      /* pinyin stuff */
      pyidx = pin_index(pystr);
      if (pyidx < 0) badfile(datafile,pystr);
      pyt = &(pinyintbl[pyidx]);
      if (pyt->init < 0) {    /* have a new pinyin, so initialize */
      pyt->tone = pystr[strlen(pystr)-1] - '1';
      pystr[strlen(pystr)-1] = '\0';  /* cheat */
      pyt->init = pin_init(pystr)-1, pyt->finl = pin_fin(pystr)-1;
      pyt->chars.ent = malloc(sizeof(int) * MAXPCHAR);
      if (!(pyt->chars.ent)) memfail("pinyin table");
      } /* new pinyin */
      if (pyt->chars.n_entries < MAXPCHAR) {
      /* only add char to list if it isn't a secondary entry */
      if (!secf)
        pyt->chars.ent[pyt->chars.n_entries++] = cidx;
      }
      else
      printf("\n*WARNING: too many characters for pronunciation %s\n",pystr);

      if (cht->pinyins.n_entries < MAXCPRON) {
      if (compron) {    /* make sure most common pronunciation is in entry 0 */
        cht->pinyins.ent[cht->pinyins.n_entries++] = cht->pinyins.ent[0];
        cht->pinyins.ent[0] = pyidx;
        }
      else
        cht->pinyins.ent[cht->pinyins.n_entries++] = pyidx;
      }
      else printf("\n**WARNING: too many pronunciations for character %d\n",
              cidx);

      } /* if have definition */
    } /* while have lines */

  nchars = c;
  fclose(fp), fclose (fpb5);
  if (!quiet) printf("Zidian: %d characters read in.\n",nchars);

}


/* function kanhe() reads in compound database and sets up the storage of
   this information */
void
kanhe(char *datadir)
{
  char            fname[200], line[LINELEN];
  FILE            *fp;
  characterst     *cht;
  idxlist   *wp;
  uchar           fidxh1, fidxl1,     /* font characters */
            fidxh2, fidxl2;
  int       fntidx1, fntidx2, /* indices */
            cidx1, cidx2,
            i,j,
            nr,nl,
            freq,
            c = 0;
  char            *tpos;
  /* for file header */
  float           version;
  int       zeroes[3], tnchars, tncomps;
  char            zerocs[3], binFileRead;

  fp = Dopen(datadir,"cidianf.gb");

  /* first we read everything into arrays so we can work with it,
   storing what we can in the character table along the way */
  while (getnextline(fp,line, LINELEN)) {
    sscanf(line,"%c%c%c%c %d",&fidxh1,&fidxl1,&fidxh2,&fidxl2,&freq);
    tpos = skip(2,line);
    fntidx1 = f_char2int(fidxh1,fidxl1), fntidx2 = f_char2int(fidxh2,fidxl2);
    cidx1 = fnt2chr[fntidx1], cidx2 = fnt2chr[fntidx2];
    if ((cidx1 > 0) && (cidx2 > 0)) {     /* only do if we have both chars */
      comptbl[c].freq = freq;
      strncpy(comptbl[c].meaning,tpos, DEFNL-1);
      comptbl[c].meaning[DEFNL-1] = '\0';  /* make sure null-terminated */
      comptbl[c].Lcharidx = cidx1, comptbl[c].Rcharidx = cidx2;
      chartbl[cidx1].Rpart.n_entries++, chartbl[cidx2].Lpart.n_entries++;
      c++;
    }
  }

  fclose(fp);
  ncomps = c;
  if (!quiet) printf("Cidian: %d compounds read in.\n",ncomps);

  /* check for link database file */
  sprintf(fname,"%s/%s",datadir,"hcompound.dat");

  binFileRead = 0;
  if ((fp = fopen(fname,"rb"))) {   /* we're in luck */
    fread(zeroes, sizeof(int), 3, fp);
    fread(zerocs, sizeof(char), 3, fp);
    fread(&version, sizeof(float), 1, fp);
    fread(&tnchars, sizeof(int), 1, fp);
    fread(&tncomps, sizeof(int), 1, fp);
    if ((zeroes[0] == 0) && (zeroes[1] == 0) && (zeroes[2] == 0) &&
        (zerocs[0] == 0) && (zerocs[1] == 0) && (zerocs[2] == 0) &&
        ((version - HANZIM_VERSION) < 0.00001) &&  /* (float inexactness) */
        (tnchars == nchars) && (tncomps == ncomps)) {
      for (i=0;i<nchars;i++) {
        cht = &(chartbl[i]);
        fread(&nr,sizeof(int),1,fp);
        fread(&nl,sizeof(int),1,fp);
        if (!(cht->Rpart.ent = malloc(sizeof(int) * nr)) ||
            !(cht->Lpart.ent = malloc(sizeof(int) * nl)))
          memfail("compound link storage");
        cht->Rpart.n_entries = nr, cht->Lpart.n_entries = nl;
        fread(cht->Rpart.ent,sizeof(int),nr,fp);
        fread(cht->Lpart.ent,sizeof(int),nl,fp);
        }
      binFileRead = 1;
      } else {
      fprintf(stderr, "\nFound old compound link database.\n");
      }
    fclose(fp);
    }

  if (!binFileRead) {
    /* compile the lists of compounds each character participates in */
    printf("Compiling compound link database (first time only for this version; \ncould take a while)...\n");

    /* Go through and copy info into character table, first for right partner
       lists, then for left partner lists */
    for (i=0;i<nchars;i++) {
      wp = &(chartbl[i].Rpart);
      if (wp->n_entries) {
      wp->ent = malloc(sizeof(int) *wp->n_entries);
      for (c=0,j=0;j<ncomps;j++)
        if (comptbl[j].Lcharidx == i) {
          wp->ent[c] = j;
          c++;
          }
      }
      }

    for (i=0;i<nchars;i++) {
      wp = &(chartbl[i].Lpart);
      if (wp->n_entries) {
      wp->ent = malloc(sizeof(int) * wp->n_entries);
      for (c=0,j=0;j<ncomps;j++)
        if (comptbl[j].Rcharidx == i) {
          wp->ent[c] = j;
          c++;
          }
      }
      }

    umask(0022);
    if (!(fp = fopen(fname,"wb"))) {
      fprintf(stderr, "**Error: cannot write %s.\n",fname);
      fprintf(stderr, "Please check permissions and disk fullness.\n");
      exit(1);
    }
    printf("Writing out database.\n");
    /* Header: three 0 ints, three 0 chars, float version number,
       nchars, ncomps  */
    zeroes[0] = zeroes[1] = zeroes[2] = 0;
    zerocs[0] = zerocs[1] = zerocs[2] = 0;
    version = HANZIM_VERSION;
    fwrite(zeroes, sizeof(int), 3, fp);
    fwrite(zerocs, sizeof(char), 3, fp);
    fwrite(&version, sizeof(float), 1, fp);
    fwrite(&nchars, sizeof(int), 1, fp);
    fwrite(&ncomps, sizeof(int), 1, fp);
    for (i=0;i<nchars;i++) {
      cht = &(chartbl[i]);
      fwrite(&cht->Rpart.n_entries,sizeof(int),1,fp);
      fwrite(&cht->Lpart.n_entries,sizeof(int),1,fp);
      fwrite(cht->Rpart.ent,sizeof(int),cht->Rpart.n_entries,fp);
      fwrite(cht->Lpart.ent,sizeof(int),cht->Lpart.n_entries,fp);
      }
    fclose(fp);
    }

}


/* function kanbu() reads in character component information from a
   radicals file and a breakdown file, setting up radical fields and the
   radical structure array.

   Pass 1 (bushou.gb): each line naming a radical number beyond those seen
   so far fills the next radtbl entry (pinyin, stroke count, meaning) and
   links it to a character, creating a special chartbl entry when the
   radical's font code is not already a known character.  Other lines only
   set the hasalt/hasfull flag of the most recent radical.
   Pass 2 (parts.gb): for every known character, records radical index,
   radical type, radical position and remainder, and registers the
   character in its remainder's remOf list.
   Finally each radical's per-type character list is allocated and filled.

   datadir: directory handed to Dopen() to locate the database files.

   May increase the global nchars by one per special radical character. */
void
kanbu(char *datadir)
{
  char            line[LINELEN];
  FILE            *fp;
  characterst     *cht, *chtR;
  /* scan targets start out defined in case a line cannot be parsed */
  uchar           fidxh = 0, fidxl = 0,       /* font characters */
                  fidxh2 = 0, fidxl2 = 0;
  int             fntidx1, cidx, pyidx, /* indices */
                  i,j,r=0, rn, rc, ns = 0,
                  ri, rt;
  char            rstr[80] = "", pos = 0,
                  pystr[10] = "";
  int             n_ent[NRAD][4];   /* keep track how many for allocation */
  char            datafile[200];    /* data directory + filename for error msg */

  /* zero the n_ent count array */
  for (i=0;i<NRAD;i++)
    for(j=0;j<4;j++)
      n_ent[i][j] = 0;

  /* first read in radical table */
  fp = Dopen(datadir,"bushou.gb");
  snprintf(datafile,sizeof(datafile),"%s/%s",datadir,"bushou.gb");

  while (getnextline(fp,line, LINELEN)) {

    /* field widths keep the tokens inside rstr[80] and pystr[10] */
    sscanf(line,"%79s %d",rstr,&ns);
    rn = radnum(rstr) + 1;
    rc = radtype(rstr);

    if (rn > r) { /* new radical */
      if (r >= NRAD) {  /* radtbl (and the spare chartbl slots) are full */
        printf("\n*WARNING: too many radicals; ignoring %s\n",rstr);
        continue;
      }
      if (rc == RAD_S) radtbl[r].hassimp = 1;
      sscanf(line,"%*s\t%*d\t%c%c\t%9s",&fidxh,&fidxl,pystr);
      pyidx = pin_index(pystr);
      if (pyidx < 0) badfile(datafile, pystr);
      radtbl[r].pinyinidx = pyidx;
      radtbl[r].strokes = ns;
      strncpy(radtbl[r].meaning,skip(4,line), DEFNL-1);
      radtbl[r].meaning[DEFNL-1] = '\0';  /* make sure null-terminated */
      fntidx1 = f_char2int(fidxh,fidxl);
      /* if already have a character in db with this font rep, use it */
      if (fnt2chr[fntidx1] >= 0)
        radtbl[r].charidx = fnt2chr[fntidx1];
      else {      /* set up a special character with this font rep */
        fnt2chr[fntidx1] = nchars;
        cht = &(chartbl[nchars]);
        cht->fontidx = fntidx1;
        if (!(cht->pinyins.ent = malloc(sizeof(int) * MAXCPRON)))
          memfail("char pinyin");
        if (!(cht->remOf.ent = malloc(sizeof(int) * MAXREMOF)))
          memfail("char remainder");
        cht->pinyins.ent[0] = radtbl[r].pinyinidx;
        cht->pinyins.n_entries = 1;
        cht->freq = 99;   /* prevent from being used (except as remainder) */
        radtbl[r].charidx = nchars;
        nchars++;
      }
      r++;
    }

    /* variant form of the most recent radical (if there is one yet) */
    else if (r > 0 && rc == RAD_A)
      radtbl[r-1].hasalt = 1;
    else if (r > 0 && rc == RAD_F)
      radtbl[r-1].hasfull = 1;

  }

  fclose(fp);

  /* now read in character composition database */
  fp = Dopen(datadir,"parts.gb");

  while (getnextline(fp,line, LINELEN)) {
    sscanf(line,"%c%c %79s %c%c %c",&fidxh,&fidxl,rstr,&fidxh2,&fidxl2,&pos);
    cidx = fnt2chr[f_char2int(fidxh,fidxl)];
    if (cidx > -1) {    /* if we have this character */
      cht = &(chartbl[cidx]);
      ri = radnum(rstr), rt = radtype(rstr);
      /* a radical/type pair that does not fit n_ent[NRAD][4] is stored
         as "no radical" (-1) instead of indexing outside the tables */
      if (ri < 0 || ri >= NRAD || rt < 0 || rt >= 4)
        ri = rt = -1;
      cht->radidx = ri, cht->radtype = rt;
      /* cast: never index with a negative value when char is signed */
      cht->radpos = rad2pos[(unsigned char) pos];
      if (ri >= 0)
        n_ent[ri][rt]++;
      /* if we have a remainder, use it, otherwise check if have radical */
      if (fidxh2 != '-') { /* store in both directions */
        if ((cht->remainder = fnt2chr[f_char2int(fidxh2,fidxl2)]) >= 0) {
          chtR = &(chartbl[cht->remainder]);
          if (chtR->remOf.n_entries < MAXREMOF)
            chtR->remOf.ent[chtR->remOf.n_entries++] = cidx;
          else printf("\n*WARNING: too many characters for remainder %d\n",
                      cht->remainder);
        }
      } /* if have remainder */
    } /* if have character */
  }

  fclose(fp);

  /* allocate space for radical character lists; empty lists get NULL
     rather than malloc(0), whose result may legitimately be NULL and
     would then be reported as an out-of-memory failure */
  for (i=0;i<NRAD;i++)
    for (j=0;j<4;j++) {
      if (n_ent[i][j] == 0) {
        radtbl[i].chars[j].ent = NULL;
        continue;
      }
      if (!(radtbl[i].chars[j].ent = malloc(sizeof(int) * n_ent[i][j])))
        memfail("radical character list");
    }

  /* now go through character list and update radical pointer lists */
  for (i=0;i<nchars;i++) {
    cht = &(chartbl[i]);
    if (cht->radidx >= 0)
      radtbl[cht->radidx].chars[cht->radtype].ent[radtbl[cht->radidx].chars[cht->radtype].n_entries++] = i;
  }

  if (verbose) printf("Bushou data read.\n");

}


/* function kansanzi() reads in the triplet database and stores the
   information; it is based on the first part of kanhe().

   Every line of sanzicidianf.gb whose three characters are all known
   (fnt2chr entry >= 0; -1 means "not in the character table") becomes a
   triplettbl entry holding the three character indices, the frequency
   and the meaning text.

   datadir: directory handed to Dopen() to locate the database file.

   Sets the global ntriplets to the number of triplets stored. */
void
kansanzi(char *datadir)
{
  char            line[LINELEN];
  FILE            *fp;
  /* scan targets start out defined in case a line cannot be parsed */
  uchar           fidxh1 = 0, fidxl1 = 0,     /* font characters */
                  fidxh2 = 0, fidxl2 = 0,
                  fidxh3 = 0, fidxl3 = 0;
  int             fntidx1, fntidx2, fntidx3, /* indices */
                  cidx1, cidx2, cidx3,
                  freq = 0,
                  c = 0;
  char            *tpos;

  fp = Dopen(datadir,"sanzicidianf.gb");

  /* read each line straight into the triplet table */
  while (getnextline(fp,line, LINELEN)) {
    sscanf(line,"%c%c%c%c%c%c %d",
           &fidxh1,&fidxl1,&fidxh2,&fidxl2,&fidxh3,&fidxl3,&freq);
    tpos = skip(2,line);
    fntidx1 = f_char2int(fidxh1,fidxl1), fntidx2 = f_char2int(fidxh2,fidxl2),
      fntidx3 = f_char2int(fidxh3,fidxl3);
    cidx1 = fnt2chr[fntidx1], cidx2 = fnt2chr[fntidx2],
      cidx3 = fnt2chr[fntidx3];
    /* only do if we have all 3 chars; index 0 is a real character, so
       the test must be >= 0, not > 0 */
    if ((cidx1>=0)&&(cidx2>=0)&&(cidx3>=0)) {
      triplettbl[c].freq = freq;
      strncpy(triplettbl[c].meaning,tpos, DEFNL-1);
      triplettbl[c].meaning[DEFNL-1] = '\0';  /* make sure null-terminated */
      triplettbl[c].char1idx = cidx1, triplettbl[c].char2idx = cidx2,
        triplettbl[c].char3idx = cidx3;
      c++;
    }
  }

  fclose(fp);
  ntriplets = c;
  if (!quiet) printf("Sanzi: %d triplets read in.\n",ntriplets);

}


/* function fanyi() translates our gb and big5 strings into utf8 so that
   tcl can print them.  For every character it stores
     - utf_jh:   the font (gb) character, converted with g2u();
     - utf_ft:   the big5 character, converted with Tcl_ExternalToUtf();
     - utf_bpmf: the big5 bpmf string, converted in place the same way.
   Characters without a big5 index (radical entries) only get utf_jh.
   The tcl call has the form:
    Tcl_ExternalToUtf(interp,encoding,src,srcLen,flags,statePtr,
                  dst,dstLen,srcReadPtr,dstWrotePtr,dstCharsPtr)
   Following this, it fills in the tone-marked final table fin_str_t and
   the initial table init_str_t, using a conversion formula to gb2312
   followed by conversion to utf through g2u().

   interp: tcl interpreter used to look up and run the big5 encoding.
 */
void
fanyi(Tcl_Interp *interp)
{
  int             i, j, t;
  characterst     *ch;
  char            gbstr[3], b5str[3], finstr[9], initstr[5], bpmfStr[15];
  /* Tcl aux stuff */
  Tcl_Encoding    b5;
  int             srcRead,dstWroteB,dstWroteC;

  b5 = Tcl_GetEncoding(interp,"big5");
  /* the tcl "gb2312" encoding is not used: it did not work here (there
     appear to be two versions of gb2312 - see docs), hence g2u() */
  if ((b5 == NULL)) i_error("tcl big5 encoding not installed");

  for (i=0;i<nchars;i++) {
    ch = &(chartbl[i]);
    sprintf(gbstr,"%c%c",f_int2charh(ch->fontidx),f_int2charl(ch->fontidx));
    g2u(gbstr,ch->utf_jh);
    if (ch->big5idx >= 0) {   /* for radicals we have no big5 */

      sprintf(b5str,"%c%c",f_int2charh(ch->big5idx),f_int2charl(ch->big5idx));
      Tcl_ExternalToUtf(interp,b5,b5str,2,0,NULL,ch->utf_ft,4,
                        &srcRead,&dstWroteB,&dstWroteC);

      /* take a private copy of the big5 bpmf string, since the result is
         written back over it.  The copy is bounded: the string comes
         from a data file and may be longer than bpmfStr. */
      strncpy(bpmfStr,ch->utf_bpmf,sizeof(bpmfStr)-1);
      bpmfStr[sizeof(bpmfStr)-1] = '\0';
      Tcl_ExternalToUtf(interp,b5,bpmfStr,strlen(bpmfStr),0,NULL,ch->utf_bpmf,
                        15, &srcRead,&dstWroteB,&dstWroteC);

    }
  }
  Tcl_FreeEncoding(b5);

  /* now we convert finals to diacritically-marked utf strings in two steps:
     first, convert the first character of the final to a gb2312 diacritical
     vowel by formula (0xA8, 0x9D + <vowelNum>*4 + tone) and the remaining
     characters to gb2312 by (0xA3, <ascii val> + 0x80); then convert the
     resulting gb string to utf. */
  /* 168 = 0xA8, 157 = 0x9D, 128 = 0x80, 163 = 0xA3 */
  /* NOTE(review): assumes every fin_str[i] starts with a lowercase
     letter and has at most 4 characters (finstr holds 9 bytes) - confirm
     against the table definition */
  for (i=0;i<N_FINALS;i++)
    for (t=0;t<5;t++) {
      /* make gb string */
      if (t < 4)
        finstr[0] = (char) 0xA8,
          finstr[1] = (char) 0x9D + 4*asc2vidx[fin_str[i][0]-'a']+t;
      else  /* 5th tone (neutral) has no diacritical */
        finstr[0] = (char) 0xA3, finstr[1] = fin_str[i][0] + 0x80;
      for (j=1;j<(int)strlen(fin_str[i]);j++)
        finstr[2+(j-1)*2] = (char) 0xA3,
          finstr[3+(j-1)*2] = fin_str[i][j] + 0x80;
      finstr[2+(j-1)*2] = '\0';
      /* convert to utf */
      g2u(finstr,fin_str_t[i][t]);
    }
  /* for simplicity, we also make utf versions of the initials */
  /* NOTE(review): assumes every init_str[i] has at most 2 characters
     (initstr holds 5 bytes) - confirm against the table definition */
  for (i=0;i<N_INITS;i++) {
    for (j=0;j<(int)strlen(init_str[i]);j++)
      initstr[j*2] = (char) 0xA3, initstr[j*2+1] = init_str[i][j] + 0x80;
    initstr[j*2] = '\0';
    g2u(initstr,init_str_t[i]);
  }

}

Generated by  Doxygen 1.6.0   Back to index