efb3245
From ba1c1bd3eb86d887fc3689c3142732658071b5f7 Mon Sep 17 00:00:00 2001
efb3245
From: Takao Fujiwara <tfujiwar@redhat.com>
efb3245
Date: Mon, 30 Jul 2018 15:26:37 +0900
efb3245
Subject: [PATCH] build: Enable python3
efb3245
efb3245
---
efb3245
 data/templates/libkkc-data/tools/genfilter.py | 18 +++++++--------
efb3245
 data/templates/libkkc-data/tools/sortlm.py    | 23 ++++++++-----------
efb3245
 2 files changed, 19 insertions(+), 22 deletions(-)
efb3245
efb3245
diff --git a/data/templates/libkkc-data/tools/genfilter.py b/data/templates/libkkc-data/tools/genfilter.py
efb3245
index 5ffab32..0c5f75a 100644
efb3245
--- a/data/templates/libkkc-data/tools/genfilter.py
efb3245
+++ b/data/templates/libkkc-data/tools/genfilter.py
efb3245
@@ -84,24 +84,24 @@ def __init__(self, infile, outfile, record_size):
efb3245
 
efb3245
     def generate(self):
efb3245
         size = os.fstat(self.infile.fileno()).st_size
efb3245
-        n = size / self.record_size
efb3245
+        n = size // self.record_size
efb3245
         m = int(math.ceil(-n*math.log10(ERROR_RATE) /
efb3245
                           math.pow(math.log10(2), 2)))
efb3245
-        m = (m/8 + 1)*8
efb3245
+        m = (m//8 + 1)*8
efb3245
         inmem = mmap.mmap(self.infile.fileno(),
efb3245
                           size,
efb3245
                           access=mmap.ACCESS_READ)
efb3245
-        outmem = bytearray(m/8)
efb3245
-        for i in xrange(0, n):
efb3245
+        outmem = bytearray(m//8)
efb3245
+        for i in range(0, n):
efb3245
             offset = i*self.record_size
efb3245
             b0, b1 = struct.unpack("=LL", inmem[offset:offset+8])
efb3245
-            for k in xrange(0, 4):
efb3245
+            for k in range(0, 4):
efb3245
                 h = murmur_hash3_32(b0, b1, k)
efb3245
                 h = int(h * (m / float(0xFFFFFFFF)))
efb3245
-                outmem[h/8] |= (1 << (h%8))
efb3245
+                outmem[h//8] |= (1 << (h%8))
efb3245
         inmem.close()
efb3245
-        # Convert bytearray to str, for Python 2.6 compatibility.
efb3245
-        self.outfile.write(str(outmem))
efb3245
+        # Convert bytearray to bytes, for Python 3 compatibility.
efb3245
+        self.outfile.write(bytes(outmem))
efb3245
 
efb3245
 if __name__ == '__main__':
efb3245
     import sys
efb3245
@@ -110,7 +110,7 @@ def generate(self):
efb3245
     parser = argparse.ArgumentParser(description='filter')
efb3245
     parser.add_argument('infile', type=argparse.FileType('r'),
efb3245
                         help='input file')
efb3245
-    parser.add_argument('outfile', type=argparse.FileType('w'),
efb3245
+    parser.add_argument('outfile', type=argparse.FileType('wb'),
efb3245
                         help='output file')
efb3245
     parser.add_argument('record_size', type=int,
efb3245
                         help='record size')
efb3245
diff --git a/data/templates/libkkc-data/tools/sortlm.py b/data/templates/libkkc-data/tools/sortlm.py
efb3245
index a0dd8fe..40f0837 100644
efb3245
--- a/data/templates/libkkc-data/tools/sortlm.py
efb3245
+++ b/data/templates/libkkc-data/tools/sortlm.py
efb3245
@@ -40,10 +40,10 @@ def __init__(self, infile, output_prefix):
efb3245
         self.__min_cost = 0.0
efb3245
 
efb3245
     def read(self):
efb3245
-        print "reading N-grams"
efb3245
+        print("reading N-grams")
efb3245
         self.__read_tries()
efb3245
         self.__read_ngrams()
efb3245
-        print "min cost = %lf" % self.__min_cost
efb3245
+        print("min cost = %lf" % self.__min_cost)
efb3245
 
efb3245
     def __read_tries(self):
efb3245
         while True:
efb3245
@@ -58,7 +58,7 @@ def __read_tries(self):
efb3245
             line = self.__infile.readline()
efb3245
             if line == "":
efb3245
                 break
efb3245
-            line = line.strip()
efb3245
+            line = line.strip('\n')
efb3245
             if line == "":
efb3245
                 break
efb3245
             match = self.__ngram_line_regex.match(line)
efb3245
@@ -89,7 +89,7 @@ def __read_ngrams(self):
efb3245
                 line = self.__infile.readline()
efb3245
                 if line == "":
efb3245
                     break
efb3245
-                line = line.strip()
efb3245
+                line = line.strip('\n')
efb3245
                 if line == "":
efb3245
                     break
efb3245
                 match = self.__ngram_line_regex.match(line)
efb3245
@@ -125,14 +125,11 @@ def __write_ngrams(self):
efb3245
         def quantize(cost, min_cost):
efb3245
             return max(0, min(65535, int(cost * 65535 / min_cost)))
efb3245
 
efb3245
-        def cmp_header(a, b):
efb3245
-            return cmp(a[0], b[0])
efb3245
-
efb3245
-        print "writing 1-gram file"
efb3245
+        print("writing 1-gram file")
efb3245
         unigram_offsets = {}
efb3245
         unigram_file = open("%s.1gram" % self.__output_prefix, "wb")
efb3245
         offset = 0
efb3245
-        for ids, value in sorted(self.__ngram_entries[0].iteritems()):
efb3245
+        for ids, value in sorted(self.__ngram_entries[0].items()):
efb3245
             unigram_offsets[ids[0]] = offset
efb3245
             s = struct.pack("=HHH",
efb3245
                             quantize(value[0], self.__min_cost),
efb3245
@@ -143,13 +140,13 @@ def cmp_header(a, b):
efb3245
             offset += 1
efb3245
         unigram_file.close()
efb3245
 
efb3245
-        print "writing 2-gram file"
efb3245
+        print("writing 2-gram file")
efb3245
         bigram_offsets = {}
efb3245
         bigram_file = open("%s.2gram" % self.__output_prefix, "wb")
efb3245
         keys = self.__ngram_entries[1].keys()
efb3245
         items = [(struct.pack("=LL", ids[1], unigram_offsets[ids[0]]), ids) for ids in keys]
efb3245
         offset = 0
efb3245
-        for header, ids in sorted(items, cmp=cmp_header):
efb3245
+        for header, ids in sorted(items, key=lambda x: x[0]):
efb3245
             value = self.__ngram_entries[1][ids]
efb3245
             bigram_offsets[ids] = offset
efb3245
             s = struct.pack("=HH",
efb3245
@@ -160,11 +157,11 @@ def cmp_header(a, b):
efb3245
         bigram_file.close()
efb3245
 
efb3245
         if len(self.__ngram_entries[2]) > 0:
efb3245
-            print "writing 3-gram file"
efb3245
+            print("writing 3-gram file")
efb3245
             trigram_file = open("%s.3gram" % self.__output_prefix, "wb")
efb3245
             keys = self.__ngram_entries[2].keys()
efb3245
             items = [(struct.pack("=LL", ids[2], bigram_offsets[(ids[0], ids[1])]), ids) for ids in keys]
efb3245
-            for header, ids in sorted(items, cmp=cmp_header):
efb3245
+            for header, ids in sorted(items, key=lambda x: x[0]):
efb3245
                 value = self.__ngram_entries[2][ids]
efb3245
                 s = struct.pack("=H",
efb3245
                                 quantize(value[0], self.__min_cost))