LinuxSir.cn,穿越时空的Linuxsir!

 找回密码
 注册
搜索
热搜: shell linux mysql
查看: 911|回复: 2

fcitx 3.2.1补订,曾加三个新工具:readPYMB、readPYBase、mb2org

[复制链接]
发表于 2006-7-17 01:58:50 | 显示全部楼层 |阅读模式
这些工具是为了提取用户的词库(个人词库)而写的。

1、readPYMB,这个程序读取用户词库的内容,然后显示。输出是这样的:
  1. PYFAIndex: 7
  2. HZ: 政
  3. UserPhraseCount: 1
  4. +-Length: 2
  5. | Map: VW
  6. | Phrase: 府
  7. | Index: 32169
  8. | Hit: 8

  9. PYFAIndex: 7
  10. HZ: 正
  11. UserPhraseCount: 2
  12. +-Length: 2
  13. | Map: Jb
  14. | Phrase: 确
  15. | Index: 30425
  16. | Hit: 1
  17. +-Length: 2
  18. | Map: BB
  19. | Phrase: 在
  20. | Index: 36211
  21. | Hit: 3
复制代码


2、readPYBase,这是我要写mb2org的时候,先写的程序,功能是显示pybase.mb文件的内容。部分输出如下:
  1. AX: HZ Index
  2.     抓 27661
  3.     爪 26844
  4.     挝 26046
  5.     摣 25265

  6. AY: HZ Index
  7.     拽 27660
  8.     嘬 26843
  9.     尵 26045
复制代码

以上例子AX、AY是Map码。

3、mb2org,是最重要的工具,他的功能是由用户词库.mb文件转化为.org文件(输出到stdout)。例如(部分输出):
  1. zhi'hou 之后
  2. zhi'jian 之间
  3. zhi'qian 之前

  4. zhi'bu'guo 只不过
  5. zhi'neng 只能
  6. zhi'shi 只是
  7. zhi'you 只有
  8. zhi'yao 只要
复制代码


这些工具的补订如下:(还有,稍微修正了tools/Makefile.am)
  1. --- fcitx-3.2.1/tools/Makefile.am        2006-07-15 16:16:23.000000000 +0800
  2. +++ fcitx-3.2.1/tools/Makefile.am        2006-07-16 23:00:45.000000000 +0800
  3. @@ -1,13 +1,22 @@
  4. +DEFS = -DPKGDATADIR="$(pkgdatadir)"
  5. +
  6. toolsdir = $(pkgdatadir)/tools
  7. INCLUDES = -I$(top_srcdir)/src -I$(top_srcdir)/lib

  8. -bin_PROGRAMS = mb2txt txt2mb createPYMB jd2fcitx pyjj2fcitx win2fcitx
  9. +bin_PROGRAMS = mb2txt txt2mb createPYMB jd2fcitx pyjj2fcitx win2fcitx readPYMB readPYBase mb2org

  10. createPYMB_LDADD = ../src/pyParser.o ../src/pyMapTable.o ../src/PYFA.o ../src/sp.o
  11. pyjj2fcitx_LDADD = ../src/pyParser.o ../src/pyMapTable.o ../src/PYFA.o ../src/sp.o
  12. +readPYMB_LDADD = pyTools.o
  13. +readPYBase_LDADD = pyTools.o
  14. +mb2org_LDADD = ../src/pyParser.o ../src/pyMapTable.o ../src/PYFA.o ../src/sp.o pyTools.o

  15. all:
  16.         ./txt2mb $(srcdir)/../data/cj.txt ../data/cj.mb
  17.         ./txt2mb $(srcdir)/../data/erbi.txt ../data/erbi.mb
  18.         ./txt2mb $(srcdir)/../data/wbx.txt ../data/wbx.mb
  19. -        ./createPYMB $(srcdir)/../data/gbkpy.org $(srcdir)/../data/pyPhrase.org && mv *.mb ../data && rm -f pyERROR ; rm -f pyPhrase.ok
  20. +        ./createPYMB $(srcdir)/../data/gbkpy.org $(srcdir)/../data/pyPhrase.org
  21. +        mv *.mb ../data
  22. +        rm -f pyERROR
  23. +        rm -f pyPhrase.ok
  24. +
  25. --- fcitx-3.2.1/tools/mb2org.c        1970-01-01 08:00:00.000000000 +0800
  26. +++ fcitx-3.2.1/tools/mb2org.c        2006-07-17 01:32:45.000000000 +0800
  27. @@ -0,0 +1,113 @@
  28. +#include <stdio.h>
  29. +#include <string.h>
  30. +#include <stdlib.h>
  31. +
  32. +#include "pyParser.h"
  33. +#include "pyMapTable.h"
  34. +#include "PYFA.h"
  35. +#include "sp.h"
  36. +#include "pyTools.h"
  37. +
  38. +/* Bad programming practice :( */
  39. +Bool bFullPY;
  40. +Bool bSingleHZMode;
  41. +
  42. +void usage();
  43. +char *HZToPY(struct _HZMap *, char []);
  44. +
  45. +int main(int argc, char **argv)
  46. +{
  47. +  FILE *fi, *fi2;
  48. +  int i, j, k;
  49. +  char *pyusrphrase_mb, *pybase_mb, *HZPY, tMap[3], tPY[10];
  50. +  struct _HZMap *HZMap;
  51. +  struct _PYMB *PYMB;
  52. +
  53. +  if (argc > 3)
  54. +    usage();
  55. +
  56. +  pyusrphrase_mb = getuserfile(PY_USERPHRASE_FILE, (argc > 1) ? argv[1] : "");
  57. +  fi = tryopen(pyusrphrase_mb);
  58. +
  59. +  pybase_mb = strdup((argc > 2) ? argv[2] : (PKGDATADIR "/data/" PY_BASE_FILE));
  60. +  fi2 = tryopen(pybase_mb);
  61. +
  62. +  LoadPYMB(fi, &PYMB);
  63. +  LoadPYBase(fi2, &HZMap);
  64. +
  65. +  for (i = 0; PYMB[i].HZ[0]; ++i)
  66. +  {
  67. +    for (j = 0; j < PYMB[i].UserPhraseCount; ++j)
  68. +    {
  69. +      HZPY = HZToPY(&(HZMap[PYMB[i].PYFAIndex]), PYMB[i].HZ);
  70. +      printf("%s", HZPY);
  71. +
  72. +      for (k = 0; k < PYMB[i].UserPhrase[j].Length / 2; ++k)
  73. +      {
  74. +        memcpy(tMap, PYMB[i].UserPhrase[j].Map + 2 * k, 2);
  75. +        tMap[2] = '\0';
  76. +        tPY[0] = '\0';
  77. +        if (!MapToPY(tMap, tPY))
  78. +          strcpy(tPY, "'*");
  79. +        printf("'%s", tPY);
  80. +      }
  81. +      printf(" %s%s\n", PYMB[i].HZ, PYMB[i].UserPhrase[j].Phrase);
  82. +
  83. +      free(HZPY);
  84. +    }
  85. +    printf("\n");
  86. +  }
  87. +
  88. +  return 0;
  89. +}
  90. +
  91. +/*
  92. +  This function takes a HanZi (HZ) and returns a PinYin (PY) string.
  93. +  If no match is found, "*" is returned.
  94. +*/
  95. +
  96. +char *HZToPY(struct _HZMap *pHZMap1, char HZ[3])
  97. +{
  98. +  int i;
  99. +  char Map[3], tPY[10];
  100. +
  101. +  Map[0] = '\0';
  102. +  for (i = 0; i < pHZMap1->BaseCount; ++i)
  103. +    if (memcmp(HZ, pHZMap1->HZ + 2 * i, 2))
  104. +    {
  105. +      strcpy(Map, pHZMap1->Map);
  106. +      break;
  107. +    }
  108. +
  109. +  if (!Map[0] || !MapToPY(Map, tPY))
  110. +    strcpy(tPY, "*");
  111. +
  112. +  return strdup(tPY);
  113. +}
  114. +
  115. +void usage()
  116. +{
  117. +  puts(
  118. +"mb2org - Convert .mb file to .org file (SEE NOTES BELOW)\n"
  119. +"\n"
  120. +"  usage: mb2org [<pyusrphrase.mb>] [<pybase.mb>]\n"
  121. +"\n"
  122. +"  <pyusrphrase.mb>   this is the .mb file to be decoded, usually this is\n"
  123. +"                     ~/.fcitx/" PY_USERPHRASE_FILE "\n"
  124. +"                     if not specified, defaults to\n"
  125. +"                     ~/.fcitx/" PY_USERPHRASE_FILE "\n"
  126. +"  <pybase.mb>        this is the pybase.mb file used to determine the\n"
  127. +"                     of the first character in HZ. Usually, this is\n"
  128. +"                     " PKGDATADIR "/data/" PY_BASE_FILE "\n"
  129. +"                     if not specified, defaults to\n"
  130. +"                     " PKGDATADIR "/data/" PY_BASE_FILE "\n"
  131. +"\n"
  132. +"NOTES:\n"
  133. +"1. If no match is found for a particular HZ, then the pinyin for that HZ\n"
  134. +"   will be `*'.\n"
  135. +"2. Always check the produced output for errors.\n"
  136. +  );
  137. +  exit(1);
  138. +  return;
  139. +}
  140. +
  141. --- fcitx-3.2.1/tools/pyTools.c        1970-01-01 08:00:00.000000000 +0800
  142. +++ fcitx-3.2.1/tools/pyTools.c        2006-07-17 01:05:26.000000000 +0800
  143. @@ -0,0 +1,138 @@
  144. +#include <stdio.h>
  145. +#include <stdlib.h>
  146. +#include <string.h>
  147. +
  148. +#include "pyTools.h"
  149. +
  150. +void LoadPYMB(FILE *fi, struct _PYMB **pPYMB)
  151. +{
  152. +  struct _PYMB *PYMB;
  153. +  int i, j, r, n, t, t2;
  154. +
  155. +  /* Is there a way to avoid reading the whole file twice? */
  156. +
  157. +  /* First Pass: Determine the size of the PYMB array to be created */
  158. +
  159. +  n = 0;
  160. +  while (1)
  161. +  {
  162. +    r = fread(&t, sizeof (int), 1, fi);
  163. +    if (!r)
  164. +      break;
  165. +    ++n;
  166. +
  167. +    fseek(fi, 2, SEEK_CUR);
  168. +    fread(&t, sizeof (int), 1, fi);
  169. +
  170. +    for (i = 0; i < t; ++i)
  171. +    {
  172. +      fread(&t2, sizeof (int), 1, fi);
  173. +      fseek(fi, 2 * t2 + 2 * sizeof (int), SEEK_CUR);
  174. +    }
  175. +  }
  176. +
  177. +  /* Second Pass: Actually read the data */
  178. +
  179. +  fseek(fi, 0, SEEK_SET);
  180. +
  181. +  *pPYMB = PYMB = malloc(sizeof (*PYMB) * (n + 1));
  182. +
  183. +  for (i = 0; i < n; ++i)
  184. +  {
  185. +    r = fread(&(PYMB[i].PYFAIndex), sizeof (int), 1, fi);
  186. +
  187. +    fread(PYMB[i].HZ, sizeof (char) * 2, 1, fi);
  188. +    PYMB[i].HZ[2] = '\0';
  189. +
  190. +    fread(&(PYMB[i].UserPhraseCount), sizeof (int), 1, fi);
  191. +    PYMB[i].UserPhrase = malloc(sizeof(*(PYMB[i].UserPhrase)) * PYMB[i].UserPhraseCount);
  192. +
  193. +#define PU(i,j) (PYMB[(i)].UserPhrase[(j)])
  194. +    for (j = 0; j < PYMB[i].UserPhraseCount; ++j)
  195. +    {
  196. +      fread(&(PU(i,j).Length), sizeof (int), 1, fi);
  197. +
  198. +      PU(i,j).Map = malloc(sizeof (char) * PU(i,j).Length + 1);
  199. +      fread(PU(i,j).Map, sizeof (char) * PU(i,j).Length, 1, fi);
  200. +      PU(i,j).Map[PU(i,j).Length] = '\0';
  201. +
  202. +      PU(i,j).Phrase = malloc(sizeof (char) * PU(i,j).Length + 1);
  203. +      fread(PU(i,j).Phrase, sizeof (char) * PU(i,j).Length, 1, fi);
  204. +      PU(i,j).Phrase[PU(i,j).Length] = '\0';
  205. +
  206. +      fread(&(PU(i,j).Index), sizeof (int), 1, fi);
  207. +
  208. +      fread(&(PU(i,j).Hit), sizeof (int), 1, fi);
  209. +    }
  210. +#undef PU
  211. +  }
  212. +  PYMB[n].HZ[0] = '\0';
  213. +
  214. +  return;
  215. +}
  216. +
  217. +int LoadPYBase(FILE *fi, struct _HZMap **pHZMap)
  218. +{
  219. +  int i, j, r, PYFACount;
  220. +  struct _HZMap *HZMap;
  221. +
  222. +  r = fread(&PYFACount, sizeof (int), 1, fi);
  223. +  if (!r)
  224. +    return 0;
  225. +
  226. +  *pHZMap = HZMap = malloc(sizeof (*HZMap) * (PYFACount + 1));
  227. +  for (i = 0; i < PYFACount; ++i)
  228. +  {
  229. +    fread(HZMap[i].Map, 2, 1, fi);
  230. +    HZMap[i].Map[2] = '\0';
  231. +
  232. +    fread(&(HZMap[i].BaseCount), sizeof (int), 1, fi);
  233. +    HZMap[i].HZ = malloc(2 * HZMap[i].BaseCount);
  234. +    HZMap[i].Index = malloc(sizeof (int) * HZMap[i].BaseCount);
  235. +
  236. +    for (j = 0; j < HZMap[i].BaseCount; ++j)
  237. +    {
  238. +      fread(HZMap[i].HZ + j * 2, 2, 1, fi);
  239. +      fread(HZMap[i].Index + j, sizeof (int), 1, fi);
  240. +    }
  241. +  }
  242. +  HZMap[i].Map[0] = '\0';
  243. +
  244. +  return PYFACount;
  245. +}
  246. +
  247. +FILE *tryopen(char *filename)
  248. +{
  249. +  FILE *fi;
  250. +
  251. +  fi = fopen(filename, "r");
  252. +  if (!fi)
  253. +  {
  254. +    perror("fopen");
  255. +    fprintf(stderr, "Can't open file `%s' for reading\n", filename);
  256. +    exit(1);
  257. +  }
  258. +
  259. +  return fi;
  260. +}
  261. +
  262. +char *getuserfile(char *name, char *given)
  263. +{
  264. +  char *filename, *home;
  265. +
  266. +  if (given[0])
  267. +    filename = strdup(given);
  268. +  else
  269. +  {
  270. +    home = getenv("HOME");
  271. +    if (!home)
  272. +      home = strdup("~");
  273. +    filename = malloc(strlen(home) + strlen("/.fcitx/") + strlen(name) + 1);
  274. +    strcpy(filename, home);
  275. +    strcat(filename, "/.fcitx/");
  276. +    strcat(filename, name);
  277. +  }
  278. +
  279. +  return filename;
  280. +}
  281. +
  282. --- fcitx-3.2.1/tools/pyTools.h        1970-01-01 08:00:00.000000000 +0800
  283. +++ fcitx-3.2.1/tools/pyTools.h        2006-07-17 01:05:43.000000000 +0800
  284. @@ -0,0 +1,35 @@
  285. +#ifndef _PY_TOOLS_H
  286. +#define _PY_TOOLS_H
  287. +
  288. +struct _PYMB
  289. +{
  290. +  int PYFAIndex;
  291. +  char HZ[3];
  292. +  int UserPhraseCount;
  293. +  struct
  294. +  {
  295. +    int Length;
  296. +    char *Map;
  297. +    char *Phrase;
  298. +    int Index;
  299. +    int Hit;
  300. +  } *UserPhrase;
  301. +};
  302. +
  303. +struct _HZMap
  304. +{
  305. +  char Map[3];
  306. +  int BaseCount;
  307. +  char *HZ;
  308. +  int *Index;
  309. +};
  310. +
  311. +int LoadPYBase(FILE *, struct _HZMap **);
  312. +void LoadPYMB(FILE *, struct _PYMB **);
  313. +
  314. +char *getuserfile(char *, char *);
  315. +
  316. +FILE *tryopen(char *);
  317. +
  318. +#endif /* _PY_TOOLS_H */
  319. +
  320. --- fcitx-3.2.1/tools/readPYBase.c        1970-01-01 08:00:00.000000000 +0800
  321. +++ fcitx-3.2.1/tools/readPYBase.c        2006-07-17 00:33:25.000000000 +0800
  322. @@ -0,0 +1,66 @@
  323. +#include <stdio.h>
  324. +
  325. +#include "py.h"
  326. +#include "pyTools.h"
  327. +
  328. +void usage();
  329. +
  330. +int main(int argc, char **argv)
  331. +{
  332. +  FILE *fi;
  333. +  int i, PYFACount;
  334. +  char *pybase_mb;
  335. +  struct _HZMap *HZMap;
  336. +
  337. +  if (argc > 2)
  338. +    usage();
  339. +
  340. +  pybase_mb = strdup((argc > 1) ? argv[1] : (PKGDATADIR "/data/" PY_BASE_FILE));
  341. +  fi = tryopen(pybase_mb);
  342. +
  343. +  PYFACount = LoadPYBase(fi, &HZMap);
  344. +  if (PYFACount > 0)
  345. +  {
  346. +#if 0
  347. +    for (i = 0; i < PYFACount; ++i)
  348. +    {
  349. +      printf("%s: ", HZMap[i].Map);
  350. +      fwrite(HZMap[i].HZ, 2, HZMap[i].BaseCount, stdout);
  351. +      printf("\n\n");
  352. +    }
  353. +#else
  354. +    for (i = 0; i < PYFACount; ++i)
  355. +    {
  356. +      int j;
  357. +      printf("%s: HZ Index\n", HZMap[i].Map);
  358. +      for (j = 0; j < HZMap[i].BaseCount / 2; ++j)
  359. +      {
  360. +        printf("    ");
  361. +        fwrite(HZMap[i].HZ + 2 * j, 2, 1, stdout);
  362. +        printf(" %5d\n", *(HZMap[i].Index + 2 * j));
  363. +      }
  364. +      printf("\n");
  365. +    }
  366. +#endif
  367. +  }
  368. +
  369. +  return 0;
  370. +}
  371. +
  372. +void usage()
  373. +{
  374. +  puts(
  375. +"readPYBase - read pybase.mb file and display its contents\n"
  376. +"\n"
  377. +"  usage: readPYBase [<pybase.mb>]\n"
  378. +"\n"
  379. +"  <pybase.mb>    full path to the file, usually\n"
  380. +"                 " PKGDATADIR "/data/" PY_BASE_FILE "\n"
  381. +"                 if not specified, defaults to\n"
  382. +"                 " PKGDATADIR "/data/" PY_BASE_FILE "\n"
  383. +"\n"
  384. +  );
  385. +  exit(1);
  386. +  return;
  387. +}
  388. +
  389. --- fcitx-3.2.1/tools/readPYMB.c        1970-01-01 08:00:00.000000000 +0800
  390. +++ fcitx-3.2.1/tools/readPYMB.c        2006-07-17 01:04:31.000000000 +0800
  391. @@ -0,0 +1,61 @@
  392. +#include <stdio.h>
  393. +#include <stdlib.h>
  394. +
  395. +#include "py.h"
  396. +#include "pyTools.h"
  397. +
  398. +void usage();
  399. +
  400. +int main(int argc, char **argv)
  401. +{
  402. +  FILE *fi;
  403. +  int i, j;
  404. +  char *pyusrphrase_mb;
  405. +  struct _PYMB *PYMB;
  406. +
  407. +  if (argc > 3)
  408. +    usage();
  409. +
  410. +  pyusrphrase_mb = getuserfile(PY_USERPHRASE_FILE, (argc > 1) ? argv[1] : "");
  411. +  fi = tryopen(pyusrphrase_mb);
  412. +  LoadPYMB(fi, &PYMB);
  413. +
  414. +  for (i = 0; PYMB[i].HZ[0]; ++i)
  415. +  {
  416. +    printf("PYFAIndex: %d\n", PYMB[i].PYFAIndex);
  417. +    printf("HZ: %s\n", PYMB[i].HZ);
  418. +    printf("UserPhraseCount: %d\n", PYMB[i].UserPhraseCount);
  419. +
  420. +    for (j = 0; j < PYMB[i].UserPhraseCount; ++j)
  421. +    {
  422. +      printf("+-Length: %d\n", PYMB[i].UserPhrase[j].Length);
  423. +      printf("| Map: %s\n", PYMB[i].UserPhrase[j].Map);
  424. +      printf("| Phrase: %s\n", PYMB[i].UserPhrase[j].Phrase);
  425. +      printf("| Index: %d\n", PYMB[i].UserPhrase[j].Index);
  426. +      printf("| Hit: %d\n", PYMB[i].UserPhrase[j].Hit);
  427. +    }
  428. +    printf("\n");
  429. +  }
  430. +
  431. +  return 0;
  432. +}
  433. +
  434. +void usage()
  435. +{
  436. +  puts(
  437. +"readPYMB - read data from a pinyin .mb file and display its meaning\n"
  438. +"\n"
  439. +"  usage: readPYMB <mbfile>\n"
  440. +"\n"
  441. +"  <mbfile>    MB (MaBiao) file to be read, usually this is\n"
  442. +"              ~/.fcitx/" PY_USERPHRASE_FILE "\n"
  443. +"              if not specified, defaults to\n"
  444. +"              ~/.fcitx/" PY_USERPHRASE_FILE "\n"
  445. +"\n"
  446. +"  The MB file can either be a user's MB file (~/.fcitx/pyuserphrase.mb),\n"
  447. +"  or the system phrase pinyin MB file (/usr/share/fcitx/data/pyphrase.mb.\n"
  448. +  );
  449. +  exit(1);
  450. +  return;
  451. +}
  452. +
复制代码


我的代码风格和Yuking的不太一样,呵呵~
希望会接受~
 楼主| 发表于 2006-7-17 02:46:48 | 显示全部楼层
漏了一个补订(不然的话编译时会出错):
  1. --- fcitx-3.2.1/Makefile.am        2006-06-06 22:16:37.000000000 +0800
  2. +++ fcitx-3.2.1/Makefile.am        2006-07-17 02:26:37.000000000 +0800
  3. @@ -1,5 +1,5 @@
  4. AUTOMAKE_OPTIONS = foreign
  5. -SUBDIRS = doc tools data debian xpm lib src
  6. +SUBDIRS = doc debian xpm lib src tools data
  7. EXTRA_DIST = autogen.sh fcitx.spec

  8. MAINTAINERCLEANFILES = \
复制代码
回复 支持 反对

使用道具 举报

发表于 2006-7-17 08:43:53 | 显示全部楼层
OK,òÏ 入所有的补丁
回复 支持 反对

使用道具 举报

您需要登录后才可以回帖 登录 | 注册

本版积分规则

快速回复 返回顶部 返回列表