| |
@@ -1470,6 +1470,134 @@
|
| |
- "description": _("Japanese direct input method developed by Hajime Ohiwa and Takaaki Takashima <http://www.crew.sfc.keio.ac.jp/~chk/intro.html>"),
|
| |
- "priority": 10
|
| |
-}
|
| |
+ diff --git a/data/templates/libkkc-data/tools/genfilter.py b/data/templates/libkkc-data/tools/genfilter.py
|
| |
+ index 5ffab32..0c5f75a 100644
|
| |
+ --- a/data/templates/libkkc-data/tools/genfilter.py
|
| |
+ +++ b/data/templates/libkkc-data/tools/genfilter.py
|
| |
+ @@ -84,24 +84,24 @@ class FilterGenerator(object):
|
| |
+
|
| |
+ def generate(self):
|
| |
+ size = os.fstat(self.infile.fileno()).st_size
|
| |
+ - n = size / self.record_size
|
| |
+ + n = size // self.record_size
|
| |
+ m = int(math.ceil(-n*math.log10(ERROR_RATE) /
|
| |
+ math.pow(math.log10(2), 2)))
|
| |
+ - m = (m/8 + 1)*8
|
| |
+ + m = (m//8 + 1)*8
|
| |
+ inmem = mmap.mmap(self.infile.fileno(),
|
| |
+ size,
|
| |
+ access=mmap.ACCESS_READ)
|
| |
+ - outmem = bytearray(m/8)
|
| |
+ - for i in xrange(0, n):
|
| |
+ + outmem = bytearray(m//8)
|
| |
+ + for i in range(0, n):
|
| |
+ offset = i*self.record_size
|
| |
+ b0, b1 = struct.unpack("=LL", inmem[offset:offset+8])
|
| |
+ - for k in xrange(0, 4):
|
| |
+ + for k in range(0, 4):
|
| |
+ h = murmur_hash3_32(b0, b1, k)
|
| |
+ h = int(h * (m / float(0xFFFFFFFF)))
|
| |
+ - outmem[h/8] |= (1 << (h%8))
|
| |
+ + outmem[h//8] |= (1 << (h%8))
|
| |
+ inmem.close()
|
| |
+ - # Convert bytearray to str, for Python 2.6 compatibility.
|
| |
+ - self.outfile.write(str(outmem))
|
| |
+ + # Convert bytearray to bytes, for Python 3 compatibility.
|
| |
+ + self.outfile.write(bytes(outmem))
|
| |
+
|
| |
+ if __name__ == '__main__':
|
| |
+ import sys
|
| |
+ @@ -110,7 +110,7 @@ if __name__ == '__main__':
|
| |
+ parser = argparse.ArgumentParser(description='filter')
|
| |
+ parser.add_argument('infile', type=argparse.FileType('r'),
|
| |
+ help='input file')
|
| |
+ - parser.add_argument('outfile', type=argparse.FileType('w'),
|
| |
+ + parser.add_argument('outfile', type=argparse.FileType('wb'),
|
| |
+ help='output file')
|
| |
+ parser.add_argument('record_size', type=int,
|
| |
+ help='record size')
|
| |
+ diff --git a/data/templates/libkkc-data/tools/sortlm.py b/data/templates/libkkc-data/tools/sortlm.py
|
| |
+ index a0dd8fe..40f0837 100644
|
| |
+ --- a/data/templates/libkkc-data/tools/sortlm.py
|
| |
+ +++ b/data/templates/libkkc-data/tools/sortlm.py
|
| |
+ @@ -40,10 +40,10 @@ class SortedGenerator(object):
|
| |
+ self.__min_cost = 0.0
|
| |
+
|
| |
+ def read(self):
|
| |
+ - print "reading N-grams"
|
| |
+ + print("reading N-grams")
|
| |
+ self.__read_tries()
|
| |
+ self.__read_ngrams()
|
| |
+ - print "min cost = %lf" % self.__min_cost
|
| |
+ + print("min cost = %lf" % self.__min_cost)
|
| |
+
|
| |
+ def __read_tries(self):
|
| |
+ while True:
|
| |
+ @@ -58,7 +58,7 @@ class SortedGenerator(object):
|
| |
+ line = self.__infile.readline()
|
| |
+ if line == "":
|
| |
+ break
|
| |
+ - line = line.strip()
|
| |
+ + line = line.strip('\n')
|
| |
+ if line == "":
|
| |
+ break
|
| |
+ match = self.__ngram_line_regex.match(line)
|
| |
+ @@ -89,7 +89,7 @@ class SortedGenerator(object):
|
| |
+ line = self.__infile.readline()
|
| |
+ if line == "":
|
| |
+ break
|
| |
+ - line = line.strip()
|
| |
+ + line = line.strip('\n')
|
| |
+ if line == "":
|
| |
+ break
|
| |
+ match = self.__ngram_line_regex.match(line)
|
| |
+ @@ -125,14 +125,11 @@ class SortedGenerator(object):
|
| |
+ def quantize(cost, min_cost):
|
| |
+ return max(0, min(65535, int(cost * 65535 / min_cost)))
|
| |
+
|
| |
+ - def cmp_header(a, b):
|
| |
+ - return cmp(a[0], b[0])
|
| |
+ -
|
| |
+ - print "writing 1-gram file"
|
| |
+ + print("writing 1-gram file")
|
| |
+ unigram_offsets = {}
|
| |
+ unigram_file = open("%s.1gram" % self.__output_prefix, "wb")
|
| |
+ offset = 0
|
| |
+ - for ids, value in sorted(self.__ngram_entries[0].iteritems()):
|
| |
+ + for ids, value in sorted(self.__ngram_entries[0].items()):
|
| |
+ unigram_offsets[ids[0]] = offset
|
| |
+ s = struct.pack("=HHH",
|
| |
+ quantize(value[0], self.__min_cost),
|
| |
+ @@ -143,13 +140,13 @@ class SortedGenerator(object):
|
| |
+ offset += 1
|
| |
+ unigram_file.close()
|
| |
+
|
| |
+ - print "writing 2-gram file"
|
| |
+ + print("writing 2-gram file")
|
| |
+ bigram_offsets = {}
|
| |
+ bigram_file = open("%s.2gram" % self.__output_prefix, "wb")
|
| |
+ keys = self.__ngram_entries[1].keys()
|
| |
+ items = [(struct.pack("=LL", ids[1], unigram_offsets[ids[0]]), ids) for ids in keys]
|
| |
+ offset = 0
|
| |
+ - for header, ids in sorted(items, cmp=cmp_header):
|
| |
+ + for header, ids in sorted(items, key=lambda x: x[0]):
|
| |
+ value = self.__ngram_entries[1][ids]
|
| |
+ bigram_offsets[ids] = offset
|
| |
+ s = struct.pack("=HH",
|
| |
+ @@ -160,11 +157,11 @@ class SortedGenerator(object):
|
| |
+ bigram_file.close()
|
| |
+
|
| |
+ if len(self.__ngram_entries[2]) > 0:
|
| |
+ - print "writing 3-gram file"
|
| |
+ + print("writing 3-gram file")
|
| |
+ trigram_file = open("%s.3gram" % self.__output_prefix, "wb")
|
| |
+ keys = self.__ngram_entries[2].keys()
|
| |
+ items = [(struct.pack("=LL", ids[2], bigram_offsets[(ids[0], ids[1])]), ids) for ids in keys]
|
| |
+ - for header, ids in sorted(items, cmp=cmp_header):
|
| |
+ + for header, ids in sorted(items, key=lambda x: x[0]):
|
| |
+ value = self.__ngram_entries[2][ids]
|
| |
+ s = struct.pack("=H",
|
| |
+ quantize(value[0], self.__min_cost))
|
| |
diff --git a/libkkc/Makefile.am b/libkkc/Makefile.am
|
| |
index 02ca2ab..28a9f68 100644
|
| |
--- a/libkkc/Makefile.am
|
| |