|
这些工具是为了提取用户的词库(个人词库)而写的。
1、readPYMB,这个程序读取用户词库的内容,然后显示。输出是这样的:
- PYFAIndex: 7
- HZ: 政
- UserPhraseCount: 1
- +-Length: 2
- | Map: VW
- | Phrase: 府
- | Index: 32169
- | Hit: 8
- PYFAIndex: 7
- HZ: 正
- UserPhraseCount: 2
- +-Length: 2
- | Map: Jb
- | Phrase: 确
- | Index: 30425
- | Hit: 1
- +-Length: 2
- | Map: BB
- | Phrase: 在
- | Index: 36211
- | Hit: 3
复制代码
2、readPYBase,这是我要写mb2org的时候,先写的程序,功能是显示pybase.mb文件的内容。部分输出如下:
- AX: HZ Index
- 抓 27661
- 爪 26844
- 挝 26046
- 摣 25265
- AY: HZ Index
- 拽 27660
- 嘬 26843
- 尵 26045
复制代码
以上例子AX、AY是Map码。
3、mb2org,是最重要的工具,它的功能是把用户词库.mb文件转换为.org文件(输出到stdout)。例如(部分输出):
- zhi'hou 之后
- zhi'jian 之间
- zhi'qian 之前
- zhi'bu'guo 只不过
- zhi'neng 只能
- zhi'shi 只是
- zhi'you 只有
- zhi'yao 只要
复制代码
这些工具的补丁如下:(另外,还稍微修正了tools/Makefile.am)
- --- fcitx-3.2.1/tools/Makefile.am 2006-07-15 16:16:23.000000000 +0800
- +++ fcitx-3.2.1/tools/Makefile.am 2006-07-16 23:00:45.000000000 +0800
- @@ -1,13 +1,22 @@
- +# The quotes are escaped so that PKGDATADIR reaches the compiler as a C string
- +# literal. Unescaped, the shell strips them and the macro expands to a bare
- +# path, which breaks expressions such as PKGDATADIR "/data/" in the tools.
- +DEFS = -DPKGDATADIR=\"$(pkgdatadir)\"
- +
- toolsdir = $(pkgdatadir)/tools
- INCLUDES = -I$(top_srcdir)/src -I$(top_srcdir)/lib
-
- -bin_PROGRAMS = mb2txt txt2mb createPYMB jd2fcitx pyjj2fcitx win2fcitx
- +bin_PROGRAMS = mb2txt txt2mb createPYMB jd2fcitx pyjj2fcitx win2fcitx readPYMB readPYBase mb2org
-
- createPYMB_LDADD = ../src/pyParser.o ../src/pyMapTable.o ../src/PYFA.o ../src/sp.o
- pyjj2fcitx_LDADD = ../src/pyParser.o ../src/pyMapTable.o ../src/PYFA.o ../src/sp.o
- +readPYMB_LDADD = pyTools.o
- +readPYBase_LDADD = pyTools.o
- +mb2org_LDADD = ../src/pyParser.o ../src/pyMapTable.o ../src/PYFA.o ../src/sp.o pyTools.o
-
- all:
- ./txt2mb $(srcdir)/../data/cj.txt ../data/cj.mb
- ./txt2mb $(srcdir)/../data/erbi.txt ../data/erbi.mb
- ./txt2mb $(srcdir)/../data/wbx.txt ../data/wbx.mb
- - ./createPYMB $(srcdir)/../data/gbkpy.org $(srcdir)/../data/pyPhrase.org && mv *.mb ../data && rm -f pyERROR ; rm -f pyPhrase.ok
- + ./createPYMB $(srcdir)/../data/gbkpy.org $(srcdir)/../data/pyPhrase.org
- + mv *.mb ../data
- + rm -f pyERROR
- + rm -f pyPhrase.ok
- +
- --- fcitx-3.2.1/tools/mb2org.c 1970-01-01 08:00:00.000000000 +0800
- +++ fcitx-3.2.1/tools/mb2org.c 2006-07-17 01:32:45.000000000 +0800
- @@ -0,0 +1,113 @@
- +#include <stdio.h>
- +#include <string.h>
- +#include <stdlib.h>
- +
- +#include "pyParser.h"
- +#include "pyMapTable.h"
- +#include "PYFA.h"
- +#include "sp.h"
- +#include "pyTools.h"
- +
- +/* File-scope flags defined here but never read or written in this file; presumably they satisfy symbols expected by the linked ../src objects -- TODO confirm. (Original author's remark: bad programming practice.) */
- +Bool bFullPY;
- +Bool bSingleHZMode;
- +
- +void usage();
- +char *HZToPY(struct _HZMap *, char []);
- +
- +/*
- + * mb2org entry point: print the user phrase file in .org form on stdout.
- + *
- + * argv[1] (optional) user phrase .mb file, resolved by getuserfile()
- + * argv[2] (optional) pybase.mb file, defaults to PKGDATADIR "/data/" PY_BASE_FILE
- + *
- + * One line is printed per user phrase: the pinyin of the leading hanzi, then
- + * "'" plus the pinyin of every two-byte Map code of the phrase, a space, and
- + * the hanzi followed by the phrase text. A syllable that cannot be resolved
- + * is printed as "*". Returns 0 on success, 1 if pybase.mb yields no data.
- + */
- +int main(int argc, char **argv)
- +{
- +    FILE *fi, *fi2;
- +    int i, j, k, PYFACount;
- +    char *pyusrphrase_mb, *pybase_mb, *HZPY, tMap[3], tPY[10];
- +    struct _HZMap *HZMap;
- +    struct _PYMB *PYMB;
- +
- +    if (argc > 3)
- +        usage();
- +
- +    pyusrphrase_mb = getuserfile(PY_USERPHRASE_FILE, (argc > 1) ? argv[1] : "");
- +    fi = tryopen(pyusrphrase_mb);
- +
- +    pybase_mb = strdup((argc > 2) ? argv[2] : (PKGDATADIR "/data/" PY_BASE_FILE));
- +    fi2 = tryopen(pybase_mb);
- +
- +    LoadPYMB(fi, &PYMB);
- +    PYFACount = LoadPYBase(fi2, &HZMap);
- +    fclose(fi);
- +    fclose(fi2);
- +
- +    /* LoadPYBase() returns 0 without building a table when the header is unreadable */
- +    if (PYFACount <= 0)
- +    {
- +        fprintf(stderr, "Can't read any data from `%s'\n", pybase_mb);
- +        return 1;
- +    }
- +
- +    for (i = 0; PYMB[i].HZ[0]; ++i)
- +    {
- +        /* The leading hanzi is the same for every phrase of this record,
- +           so resolve it once; an out-of-range index is reported as "*". */
- +        if (PYMB[i].PYFAIndex >= 0 && PYMB[i].PYFAIndex < PYFACount)
- +            HZPY = HZToPY(&(HZMap[PYMB[i].PYFAIndex]), PYMB[i].HZ);
- +        else
- +            HZPY = strdup("*");
- +
- +        for (j = 0; j < PYMB[i].UserPhraseCount; ++j)
- +        {
- +            printf("%s", HZPY);
- +
- +            /* Map holds one two-byte code per syllable */
- +            for (k = 0; k < PYMB[i].UserPhrase[j].Length / 2; ++k)
- +            {
- +                memcpy(tMap, PYMB[i].UserPhrase[j].Map + 2 * k, 2);
- +                tMap[2] = '\0';
- +                tPY[0] = '\0';
- +                /* the separator is added by the printf below, so the
- +                   placeholder must not carry its own apostrophe */
- +                if (!MapToPY(tMap, tPY))
- +                    strcpy(tPY, "*");
- +                printf("'%s", tPY);
- +            }
- +            printf(" %s%s\n", PYMB[i].HZ, PYMB[i].UserPhrase[j].Phrase);
- +        }
- +
- +        free(HZPY);
- +        printf("\n");
- +    }
- +
- +    return 0;
- +}
- +
- +/*
- + * Return the pinyin (PY) of a hanzi (HZ) as a newly allocated string.
- + *
- + * pHZMap1  table entry (as built by LoadPYBase) to search
- + * HZ       the hanzi; only its first two bytes are compared
- + *
- + * HZ is looked up among the BaseCount two-byte hanzi stored in pHZMap1->HZ.
- + * On a hit the entry's Map code is converted with MapToPY(). If the hanzi is
- + * not listed, or the Map code cannot be converted, "*" is returned.
- + * The result comes from strdup(); the caller must free() it.
- + */
- +
- +char *HZToPY(struct _HZMap *pHZMap1, char HZ[3])
- +{
- +    int i;
- +    char Map[3], tPY[10];
- +
- +    Map[0] = '\0';
- +    tPY[0] = '\0';
- +    for (i = 0; i < pHZMap1->BaseCount; ++i)
- +        /* memcmp() returns 0 when the bytes are equal, so a hit is !memcmp() */
- +        if (!memcmp(HZ, pHZMap1->HZ + 2 * i, 2))
- +        {
- +            strcpy(Map, pHZMap1->Map);
- +            break;
- +        }
- +
- +    if (!Map[0] || !MapToPY(Map, tPY))
- +        strcpy(tPY, "*");
- +
- +    return strdup(tPY);
- +}
- +
- +/*
- + * Print the mb2org command-line help on stdout and terminate the program
- + * with exit status 1. Does not return.
- + */
- +void usage()
- +{
- +    puts(
- +"mb2org - Convert .mb file to .org file (SEE NOTES BELOW)\n"
- +"\n"
- +" usage: mb2org [<pyusrphrase.mb>] [<pybase.mb>]\n"
- +"\n"
- +" <pyusrphrase.mb> this is the .mb file to be decoded, usually this is\n"
- +" ~/.fcitx/" PY_USERPHRASE_FILE "\n"
- +" if not specified, defaults to\n"
- +" ~/.fcitx/" PY_USERPHRASE_FILE "\n"
- +" <pybase.mb> this is the pybase.mb file used to determine the\n"
- +" pinyin of the first character in HZ. Usually, this is\n"
- +" " PKGDATADIR "/data/" PY_BASE_FILE "\n"
- +" if not specified, defaults to\n"
- +" " PKGDATADIR "/data/" PY_BASE_FILE "\n"
- +"\n"
- +"NOTES:\n"
- +"1. If no match is found for a particular HZ, then the pinyin for that HZ\n"
- +" will be `*'.\n"
- +"2. Always check the produced output for errors.\n"
- +    );
- +    exit(1);
- +}
- +
- --- fcitx-3.2.1/tools/pyTools.c 1970-01-01 08:00:00.000000000 +0800
- +++ fcitx-3.2.1/tools/pyTools.c 2006-07-17 01:05:26.000000000 +0800
- @@ -0,0 +1,138 @@
- +#include <stdio.h>
- +#include <stdlib.h>
- +#include <string.h>
- +
- +#include "pyTools.h"
- +
- +/* Read exactly `size' bytes into `buf'; a zero-length request trivially succeeds. */
- +static int ReadPYMBBytes(FILE *fi, void *buf, size_t size)
- +{
- +    return size == 0 || fread(buf, size, 1, fi) == 1;
- +}
- +
- +/* realloc() wrapper that never returns NULL: reports the error and exits instead. */
- +static void *AllocPYMB(void *old, size_t size)
- +{
- +    void *p = realloc(old, size ? size : 1);
- +
- +    if (!p)
- +    {
- +        perror("realloc");
- +        exit(1);
- +    }
- +    return p;
- +}
- +
- +/*
- + * Load a pinyin phrase .mb file into a heap-allocated array of records.
- + *
- + * fi     stream to read from, starting at its current position
- + * pPYMB  receives the array; the element after the last record has
- + *        HZ[0] == '\0' and serves as the terminator
- + *
- + * Layout of one record, as consumed below (ints are read with the host's
- + * size and byte order):
- + *   int PYFAIndex, 2 bytes HZ, int UserPhraseCount, then per phrase:
- + *   int Length, Length bytes Map, Length bytes Phrase, int Index, int Hit.
- + *
- + * The file is read in a single pass and the array grows on demand. Reading
- + * stops at end of file or at the first truncated/implausible field; records
- + * and phrases that were read completely are kept. Running out of memory
- + * terminates the program. Nothing is freed here; the caller owns the array,
- + * every UserPhrase array and every Map/Phrase string.
- + */
- +void LoadPYMB(FILE *fi, struct _PYMB **pPYMB)
- +{
- +    struct _PYMB *PYMB = NULL;
- +    char *map, *phrase;
- +    int n = 0, cap = 0, j, count, len;
- +
- +    for (;;)
- +    {
- +        /* grow geometrically, always keeping one slot for the terminator */
- +        if (n + 1 >= cap)
- +        {
- +            cap = cap ? cap * 2 : 64;
- +            PYMB = AllocPYMB(PYMB, sizeof (*PYMB) * cap);
- +        }
- +
- +        if (!ReadPYMBBytes(fi, &(PYMB[n].PYFAIndex), sizeof (int))
- +            || !ReadPYMBBytes(fi, PYMB[n].HZ, 2)
- +            || !ReadPYMBBytes(fi, &count, sizeof (int))
- +            || count < 0
- +            || (size_t) count > (size_t) -1 / sizeof (*(PYMB[n].UserPhrase)))
- +            break;
- +        PYMB[n].HZ[2] = '\0';
- +
- +        PYMB[n].UserPhrase = AllocPYMB(NULL, sizeof (*(PYMB[n].UserPhrase)) * count);
- +
- +        for (j = 0; j < count; ++j)
- +        {
- +            if (!ReadPYMBBytes(fi, &len, sizeof (int)) || len < 0)
- +                break;
- +
- +            map = AllocPYMB(NULL, (size_t) len + 1);
- +            phrase = AllocPYMB(NULL, (size_t) len + 1);
- +
- +            if (!ReadPYMBBytes(fi, map, len)
- +                || !ReadPYMBBytes(fi, phrase, len)
- +                || !ReadPYMBBytes(fi, &(PYMB[n].UserPhrase[j].Index), sizeof (int))
- +                || !ReadPYMBBytes(fi, &(PYMB[n].UserPhrase[j].Hit), sizeof (int)))
- +            {
- +                free(map);
- +                free(phrase);
- +                break;
- +            }
- +            map[len] = '\0';
- +            phrase[len] = '\0';
- +
- +            PYMB[n].UserPhrase[j].Length = len;
- +            PYMB[n].UserPhrase[j].Map = map;
- +            PYMB[n].UserPhrase[j].Phrase = phrase;
- +        }
- +
- +        /* j < count means the file ended early: keep what is complete */
- +        PYMB[n].UserPhraseCount = j;
- +        ++n;
- +        if (j < count)
- +            break;
- +    }
- +
- +    PYMB[n].HZ[0] = '\0';
- +    *pPYMB = PYMB;
- +
- +    return;
- +}
- +
- +/*
- + * Load a pybase.mb file into a heap-allocated array of struct _HZMap.
- + *
- + * fi      stream to read from, starting at its current position
- + * pHZMap  receives the array; the element after the last entry has
- + *         Map[0] == '\0'. Set to NULL when the function returns 0 because
- + *         the leading count is unreadable or negative.
- + *
- + * Layout, as consumed below (ints are read with the host's size and byte
- + * order): int PYFACount, then per entry 2 bytes Map, int BaseCount and
- + * BaseCount pairs of (2 bytes HZ, int Index).
- + *
- + * Returns the number of entries loaded. For a well-formed file this is
- + * PYFACount; if the file is truncated, reading stops and the entries (and
- + * hanzi) read so far are kept. Running out of memory terminates the program.
- + * Nothing is freed here; the caller owns the array and its HZ/Index buffers.
- + */
- +int LoadPYBase(FILE *fi, struct _HZMap **pHZMap)
- +{
- +    int i, j, PYFACount, count;
- +    struct _HZMap *HZMap;
- +
- +    *pHZMap = NULL;
- +    if (fread(&PYFACount, sizeof (int), 1, fi) != 1 || PYFACount < 0)
- +        return 0;
- +
- +    HZMap = malloc(sizeof (*HZMap) * ((size_t) PYFACount + 1));
- +    if (!HZMap)
- +    {
- +        perror("malloc");
- +        exit(1);
- +    }
- +
- +    for (i = 0; i < PYFACount; ++i)
- +    {
- +        if (fread(HZMap[i].Map, 2, 1, fi) != 1
- +            || fread(&count, sizeof (int), 1, fi) != 1
- +            || count < 0)
- +            break;
- +        HZMap[i].Map[2] = '\0';
- +
- +        /* never request 0 bytes: malloc(0) may legitimately return NULL */
- +        HZMap[i].HZ = malloc(count ? 2 * (size_t) count : 1);
- +        HZMap[i].Index = malloc(count ? sizeof (int) * (size_t) count : 1);
- +        if (!HZMap[i].HZ || !HZMap[i].Index)
- +        {
- +            perror("malloc");
- +            exit(1);
- +        }
- +
- +        for (j = 0; j < count; ++j)
- +            if (fread(HZMap[i].HZ + j * 2, 2, 1, fi) != 1
- +                || fread(HZMap[i].Index + j, sizeof (int), 1, fi) != 1)
- +                break;
- +
- +        /* on truncation keep the partial entry, then stop */
- +        HZMap[i].BaseCount = j;
- +        if (j < count)
- +        {
- +            ++i;
- +            break;
- +        }
- +    }
- +    HZMap[i].Map[0] = '\0';
- +
- +    *pHZMap = HZMap;
- +    return i;
- +}
- +
- +/*
- + * Open `filename' for reading and return the stream; never returns NULL.
- + *
- + * The file is opened in binary mode because the callers read raw structures
- + * from it with fread(). If it cannot be opened, the reason is reported on
- + * stderr and the program exits with status 1.
- + */
- +FILE *tryopen(char *filename)
- +{
- +    FILE *fi = fopen(filename, "rb");
- +
- +    if (!fi)
- +    {
- +        perror("fopen");
- +        fprintf(stderr, "Can't open file `%s' for reading\n", filename);
- +        exit(1);
- +    }
- +
- +    return fi;
- +}
- +
- +/*
- + * Work out which user file to open; the result is a newly allocated string.
- + *
- + * name   file name to look for below the per-user directory
- + * given  path supplied by the user, or "" to request the default
- + *
- + * A non-empty `given' is returned as a copy. Otherwise the result is
- + * $HOME/.fcitx/<name>; if HOME is unset, "~" is used in its place (note that
- + * fopen() does not expand "~", so that fallback only keeps the path readable
- + * in error messages). The caller must free() the result. Running out of
- + * memory terminates the program.
- + */
- +char *getuserfile(char *name, char *given)
- +{
- +    static const char dir[] = "/.fcitx/";
- +    const char *home;
- +    char *filename;
- +
- +    if (given[0])
- +        filename = strdup(given);
- +    else
- +    {
- +        home = getenv("HOME");
- +        if (!home)
- +            home = "~"; /* a literal is enough: it is only read, never freed */
- +        filename = malloc(strlen(home) + strlen(dir) + strlen(name) + 1);
- +        if (filename)
- +        {
- +            strcpy(filename, home);
- +            strcat(filename, dir);
- +            strcat(filename, name);
- +        }
- +    }
- +
- +    if (!filename)
- +    {
- +        perror("malloc");
- +        exit(1);
- +    }
- +
- +    return filename;
- +}
- +
- --- fcitx-3.2.1/tools/pyTools.h 1970-01-01 08:00:00.000000000 +0800
- +++ fcitx-3.2.1/tools/pyTools.h 2006-07-17 01:05:43.000000000 +0800
- @@ -0,0 +1,35 @@
- +#ifndef _PY_TOOLS_H
- +#define _PY_TOOLS_H
- +
- +struct _PYMB /* one record of a pinyin phrase .mb file, as filled in by LoadPYMB() */
- +{
- + int PYFAIndex; /* first int of the record; mb2org uses it as an index into the array built by LoadPYBase() */
- + char HZ[3]; /* two bytes from the file plus '\0'; HZ[0] == '\0' marks the terminator element of the array */
- + int UserPhraseCount; /* number of elements in UserPhrase */
- + struct
- + {
- + int Length; /* byte count of Map and also of Phrase, terminators excluded */
- + char *Map; /* Length bytes plus '\0'; mb2org decodes it two bytes at a time with MapToPY() */
- + char *Phrase; /* Length bytes plus '\0'; mb2org prints it right after HZ */
- + int Index; /* stored verbatim from the file */
- + int Hit; /* stored verbatim from the file */
- + } *UserPhrase; /* heap array allocated by LoadPYMB() */
- +};
- +
- +struct _HZMap /* one entry of pybase.mb, as filled in by LoadPYBase() */
- +{
- + char Map[3]; /* two-byte Map code plus '\0'; Map[0] == '\0' marks the terminator element of the array */
- + int BaseCount; /* number of hanzi in HZ and of ints in Index */
- + char *HZ; /* 2 * BaseCount bytes, two per hanzi, NOT '\0'-terminated */
- + int *Index; /* BaseCount ints; Index[j] was read together with the hanzi at HZ + 2 * j */
- +};
- +
- +int LoadPYBase(FILE *, struct _HZMap **); /* loads pybase.mb into a new array; returns the entry count, 0 if the leading count cannot be read */
- +void LoadPYMB(FILE *, struct _PYMB **); /* loads a phrase .mb file into a new array terminated by an element with HZ[0] == '\0' */
- +
- +char *getuserfile(char *, char *); /* (name, given): copy of `given' if non-empty, else $HOME/.fcitx/<name>; result is heap-allocated */
- +
- +FILE *tryopen(char *); /* fopen() for reading; prints an error and exits with status 1 on failure */
- +
- +#endif /* _PY_TOOLS_H */
- +
- --- fcitx-3.2.1/tools/readPYBase.c 1970-01-01 08:00:00.000000000 +0800
- +++ fcitx-3.2.1/tools/readPYBase.c 2006-07-17 00:33:25.000000000 +0800
- @@ -0,0 +1,66 @@
- +#include <stdio.h>
- +
- +#include "py.h"
- +#include "pyTools.h"
- +
- +void usage();
- +
- +/*
- + * readPYBase entry point: dump the contents of a pybase.mb file on stdout.
- + *
- + * argv[1] (optional) path of the file; defaults to
- + *         PKGDATADIR "/data/" PY_BASE_FILE
- + *
- + * For every entry the Map code is printed as a heading, followed by one line
- + * per hanzi showing the hanzi and the index stored with it.
- + */
- +int main(int argc, char **argv)
- +{
- +    FILE *fi;
- +    int i, j, PYFACount;
- +    char *pybase_mb;
- +    struct _HZMap *HZMap;
- +
- +    if (argc > 2)
- +        usage();
- +
- +    pybase_mb = strdup((argc > 1) ? argv[1] : (PKGDATADIR "/data/" PY_BASE_FILE));
- +    fi = tryopen(pybase_mb);
- +
- +    PYFACount = LoadPYBase(fi, &HZMap);
- +    fclose(fi);
- +
- +    for (i = 0; i < PYFACount; ++i)
- +    {
- +        printf("%s: HZ Index\n", HZMap[i].Map);
- +        /* LoadPYBase() stores BaseCount hanzi at HZ + 2 * j and their
- +           indices at Index + j, so walk both with the same j */
- +        for (j = 0; j < HZMap[i].BaseCount; ++j)
- +        {
- +            printf(" ");
- +            fwrite(HZMap[i].HZ + 2 * j, 2, 1, stdout);
- +            printf(" %5d\n", HZMap[i].Index[j]);
- +        }
- +        printf("\n");
- +    }
- +
- +    return 0;
- +}
- +
- +/*
- + * Show the readPYBase help text on stdout, then end the program with
- + * exit status 1. Control never comes back to the caller.
- + */
- +void usage()
- +{
- +    static const char help[] =
- +"readPYBase - read pybase.mb file and display its contents\n"
- +"\n"
- +" usage: readPYBase [<pybase.mb>]\n"
- +"\n"
- +" <pybase.mb> full path to the file, usually\n"
- +" " PKGDATADIR "/data/" PY_BASE_FILE "\n"
- +" if not specified, defaults to\n"
- +" " PKGDATADIR "/data/" PY_BASE_FILE "\n"
- +"\n";
- +
- +    puts(help);
- +    exit(1);
- +}
- +
- --- fcitx-3.2.1/tools/readPYMB.c 1970-01-01 08:00:00.000000000 +0800
- +++ fcitx-3.2.1/tools/readPYMB.c 2006-07-17 01:04:31.000000000 +0800
- @@ -0,0 +1,61 @@
- +#include <stdio.h>
- +#include <stdlib.h>
- +
- +#include "py.h"
- +#include "pyTools.h"
- +
- +void usage();
- +
- +/*
- + * readPYMB entry point: dump the contents of a pinyin phrase .mb file on stdout.
- + *
- + * argv[1] (optional) path of the file, resolved by getuserfile(); when it is
- + *         omitted the user's PY_USERPHRASE_FILE is read
- + *
- + * Every record is printed as its PYFAIndex, HZ and UserPhraseCount, followed
- + * by the Length, Map, Phrase, Index and Hit of each of its phrases.
- + */
- +int main(int argc, char **argv)
- +{
- +    FILE *fi;
- +    int i, j;
- +    char *pyusrphrase_mb;
- +    struct _PYMB *PYMB;
- +
- +    /* only one optional argument is understood; anything more is a usage error */
- +    if (argc > 2)
- +        usage();
- +
- +    pyusrphrase_mb = getuserfile(PY_USERPHRASE_FILE, (argc > 1) ? argv[1] : "");
- +    fi = tryopen(pyusrphrase_mb);
- +    LoadPYMB(fi, &PYMB);
- +    fclose(fi);
- +
- +    for (i = 0; PYMB[i].HZ[0]; ++i)
- +    {
- +        printf("PYFAIndex: %d\n", PYMB[i].PYFAIndex);
- +        printf("HZ: %s\n", PYMB[i].HZ);
- +        printf("UserPhraseCount: %d\n", PYMB[i].UserPhraseCount);
- +
- +        for (j = 0; j < PYMB[i].UserPhraseCount; ++j)
- +        {
- +            printf("+-Length: %d\n", PYMB[i].UserPhrase[j].Length);
- +            printf("| Map: %s\n", PYMB[i].UserPhrase[j].Map);
- +            printf("| Phrase: %s\n", PYMB[i].UserPhrase[j].Phrase);
- +            printf("| Index: %d\n", PYMB[i].UserPhrase[j].Index);
- +            printf("| Hit: %d\n", PYMB[i].UserPhrase[j].Hit);
- +        }
- +        printf("\n");
- +    }
- +
- +    return 0;
- +}
- +
- +/*
- + * Print the readPYMB command-line help on stdout and terminate the program
- + * with exit status 1. Does not return.
- + */
- +void usage()
- +{
- +    puts(
- +"readPYMB - read data from a pinyin .mb file and display its meaning\n"
- +"\n"
- +" usage: readPYMB [<mbfile>]\n"
- +"\n"
- +" <mbfile> MB (MaBiao) file to be read, usually this is\n"
- +" ~/.fcitx/" PY_USERPHRASE_FILE "\n"
- +" if not specified, defaults to\n"
- +" ~/.fcitx/" PY_USERPHRASE_FILE "\n"
- +"\n"
- +" The MB file can either be a user's MB file (~/.fcitx/pyuserphrase.mb),\n"
- +" or the system phrase pinyin MB file (/usr/share/fcitx/data/pyphrase.mb).\n"
- +    );
- +    exit(1);
- +}
- +
复制代码
我的代码风格和Yuking的不太一样,呵呵~
希望会接受~ |
|