|

楼主 |
发表于 2006-6-17 03:57:43
|
显示全部楼层
工具也发一下喽。
用法:./fc.py /usr/share/fcitx/data/wbx.mb wbchen.txt wbchen.mb
运行条件:
linux
python2.4
已安装fcitx
有mb2txt及txt2mb命令
说明:
第一个参数是已有码表文件,程序运行会在此文件中查找单字的五笔编码。
第二个参数是用“;”分隔的词语文件,整个文件只有一行。注意此文件中不能含有在第一个参数的码表中找不到编码的字,如“阿Ｑ”,这里的“Ｑ”是全角的,在默认的五笔型中没有对应的编码。另外此文件编码请转成utf8。
第三个参数就是根据提供的词语生成的新词库。
应用:
如果已经运行一次,如生成了wbchen.mb,而现在又有新词库(如new.txt)想加入,可以如下再运行一次:
./fc.py wbchen.mb new.txt new.mb
fc.py源代码:
[PHP]
#!/usr/bin/env python2.4
# -*- encoding: utf8 -*-
# Filename: fc.py
# Description: add new phrases to an fcitx Wubi code table (.mb)
# Author(s): yetist
# Version:
import sys
import os
import time
import string
def mb2txt(src,dst="wbx_tmp.txt"):
    """Dump the binary table ``src`` to text and return its data lines.

    Runs the external ``mb2txt`` command through the shell with its output
    redirected to ``dst``, then reads ``dst`` back.  Lines that contain
    ``=`` or start with ``[`` (the header section) are dropped; every other
    line is decoded from GB18030 and returned with its line ending intact.

    src -- path of the ``.mb`` table passed to ``mb2txt``.
    dst -- path of the text dump; the file is left on disk.

    Returns a list of unicode lines in file order.
    Raises IOError if ``dst`` cannot be opened and UnicodeDecodeError if
    the dump is not valid GB18030.
    """
    cmd = "mb2txt %s > %s " % (src, dst)
    # A non-zero status is not treated as fatal: whatever the shell left in
    # ``dst`` is still read below.
    if os.system(cmd) == 0:
        print("mb to txt \033[32mok \033[0m")

    fp = open(dst, "rb")
    try:
        raw_lines = fp.readlines()
    finally:
        fp.close()

    # Build a fresh list instead of deleting from the list being iterated:
    # removing during iteration made the loop skip the line that followed
    # each removed header line, which could drop a real data entry.
    bianmaindex = []
    for raw in raw_lines:
        line = raw.decode("gb18030")
        if "=" in line or line.startswith("["):
            continue
        bianmaindex.append(line)
    return bianmaindex
def get_danzi_mabian(allmabian):
    """Pick the single-character entries out of a list of table lines.

    Each line is split on whitespace into fields; the first field is the
    key code and the second is the text it produces.  An entry is kept when
    its text is exactly one character long and its code has at least two
    letters.

    allmabian -- iterable of text lines such as ``"ab X\\n"``.

    Returns a list of field lists (``[code, character, ...]``) in input
    order.  Lines with fewer than two fields are ignored.
    """
    bianmaindex = []
    for line in allmabian:
        fields = line.split()
        # Blank or malformed lines carry no code/text pair; indexing them
        # blindly used to raise IndexError.
        if len(fields) < 2:
            continue
        if len(fields[1]) == 1 and len(fields[0]) >= 2:
            bianmaindex.append(fields)
    return bianmaindex
def split_chenhu(srcfile):
    """Read a UTF-8 file of ``;``-separated phrases and return the phrases.

    The whole file is decoded as UTF-8 and split on ``;``.  Surrounding
    whitespace is stripped from every piece, and pieces shorter than two
    characters (single characters and empty pieces such as the one after a
    trailing ``;``) are discarded.

    srcfile -- path of the phrase file.

    Returns a list of unicode phrases in file order.
    Raises IOError if the file cannot be opened and UnicodeDecodeError if
    it is not valid UTF-8.
    """
    fp = open(srcfile, "rb")
    try:
        text = fp.read().decode("utf8")
    finally:
        fp.close()

    cizu = []
    for piece in text.split(";"):
        word = piece.strip()
        # "< 2" rather than "== 1": empty pieces are not phrases either.
        if len(word) < 2:
            continue
        cizu.append(word)
    return cizu
def sub_cizu_from_mabiao(cizulist,dst="wbx_tmp.txt"):
    """Return the phrases from ``cizulist`` that the text table lacks.

    ``dst`` is read as GB18030 text with one ``code text`` entry per line.
    A phrase counts as present only when it equals the second field of some
    line; merely occurring inside a longer phrase does not count.  Empty
    strings and repeated phrases in ``cizulist`` are dropped, so each
    missing phrase is returned once, in first-seen order.

    cizulist -- iterable of unicode phrases.
    dst      -- path of the text dump of the table.

    Raises IOError if ``dst`` cannot be opened and UnicodeDecodeError if it
    is not valid GB18030.
    """
    fp = open(dst, "rb")
    try:
        text = fp.read().decode("gb18030")
    finally:
        fp.close()

    # One pass over the table builds an exact-match lookup, replacing a
    # substring search of the whole file for every phrase.
    known = set()
    for line in text.split("\n"):
        fields = line.split()
        if len(fields) >= 2:
            known.add(fields[1])

    newcizu = []
    for word in cizulist:
        if not word or word in known:
            continue
        known.add(word)  # a later repeat of the same phrase is skipped
        newcizu.append(word)
    return newcizu
def make_mabiao(danzi_mabiao,cizulist):
    """Build ``code phrase`` table lines for the given phrases.

    The key code of a phrase is assembled from the codes of its characters:

    * 2 characters: first two letters of each character's code;
    * 3 characters: first letter of the first two codes, then the first
      two letters of the third code;
    * 4 or more: first letter of the first, second, third and last codes.

    danzi_mabiao -- list of ``[code, character]`` entries; when a character
                    is listed more than once, the first entry is used.
    cizulist     -- iterable of unicode phrases.

    Returns a list of lines ``code + " " + phrase + "\\n"`` in input order.
    Phrases shorter than two characters produce no line.  A phrase that
    contains a character missing from ``danzi_mabiao`` is skipped with a
    note on stderr instead of aborting the run with IndexError.
    """
    # Index the table once; setdefault keeps the first code per character,
    # which is what a front-to-back scan with ``break`` would find.
    code_of = {}
    for entry in danzi_mabiao:
        code_of.setdefault(entry[1], entry[0])

    mabiaolist = []
    for word in cizulist:
        length = len(word)
        if length < 2:
            continue

        codes = []
        for ch in word:
            code = code_of.get(ch)
            if code is None:
                break
            codes.append(code)
        if len(codes) != length:
            # %r keeps the message ASCII-safe for any stderr encoding.
            sys.stderr.write(
                "skip %r: no code for one of its characters\n" % (word,))
            continue

        if length == 2:
            key = codes[0][:2] + codes[1][:2]
        elif length == 3:
            key = codes[0][:1] + codes[1][:1] + codes[2][:2]
        else:
            key = codes[0][:1] + codes[1][:1] + codes[2][:1] + codes[-1][:1]
        mabiaolist.append(key + " " + word + "\n")
    return mabiaolist
def main(argv):
    """Merge a phrase list into an fcitx table and write a new ``.mb`` file.

    argv -- argument list laid out like ``sys.argv``: program name, the
            existing ``.mb`` table, the UTF-8 phrase file, the output
            ``.mb`` path.

    Steps: dump the existing table to ``wbx_tmp.txt``, read the phrases,
    drop those already in the table, derive codes for the rest, write the
    merged and sorted table text in GB18030, then run ``txt2mb`` on it.
    The temporary text file is removed afterwards.

    Returns the process exit status: 0 on success, 1 if ``txt2mb`` reports
    failure, 2 if too few arguments were given.
    """
    if len(argv) < 4:
        sys.stderr.write("usage: fc.py <existing.mb> <words.txt> <new.mb>\n")
        return 2

    started = time.time()
    mbsource = argv[1]
    src = argv[2]
    dst = argv[3]
    tmpfile = "wbx_tmp.txt"

    allmabiao = mb2txt(mbsource, tmpfile)
    print("get all mabiao \033[32mok \033[0m")
    allcizu = split_chenhu(src)
    print("get cizulist from chenhu \033[32mok \033[0m")
    # Must run before tmpfile is rewritten below: it reads the dump that
    # mb2txt just produced.
    cizulist = sub_cizu_from_mabiao(allcizu, tmpfile)
    danzi = get_danzi_mabian(allmabiao)
    print("get danzi mabiao \033[32mok \033[0m")
    cizumabiao = make_mabiao(danzi, cizulist)
    print("get cizu mabiao \033[32mok \033[0m")
    allmabiao.extend(cizumabiao)
    allmabiao.sort()
    print(len(allmabiao))

    # Header section expected in front of the data lines of the table text.
    header = u'键码=abcdefghijklmnopqrstuvwxy\n码长=4\n[组词规则]\ne2=p11+p12+p21+p22\ne3=p11+p21+p31+p32\na4=p11+p21+p31+n11\n[数据]\n'.encode("gb18030")
    # Binary mode: everything written is already GB18030-encoded bytes.
    fp = open(tmpfile, "wb")
    try:
        fp.write(header)
        for line in allmabiao:
            fp.write(line.encode("gb18030"))
    finally:
        fp.close()

    exit_code = 0
    cmd = "txt2mb %s %s " % (tmpfile, dst)
    if os.system(cmd) == 0:
        print("ok,your new mb file is :%s" % dst)
    else:
        # Report the failure instead of finishing silently with status 0.
        sys.stderr.write("txt2mb failed, %s was not created\n" % dst)
        exit_code = 1
    os.remove(tmpfile)

    print("time: %s" % (time.time() - started))
    return exit_code


if __name__ == "__main__":
    sys.exit(main(sys.argv))
[/PHP] |
|