calibre

Merge lp:~user-none/calibre/dev into lp:calibre

dev
Merge into trunk

Proposed by John Schember on 2010-12-31

Status:	Merged
Merged at revision:	7392
Proposed branch:	lp:~user-none/calibre/dev
Merge into:	lp:calibre
Diff against target:	253 lines (+112/-105) 2 files modified src/calibre/ebooks/compression/tcr.py (+111/-99) src/calibre/ebooks/tcr/output.py (+1/-6)
To merge this branch:	bzr merge lp:~user-none/calibre/dev
Related bugs:	Link a bug report

Reviewer	Review Type	Date Requested	Status
Kovid Goyal		2010-12-31	Pending
Review via email: mp+44957@code.launchpad.net

Description of the change

TCR compression rewrite. Huge improvement in all aspects.

lp:~user-none/calibre/dev updated on 2010-12-31

7391. By Kovid Goyal on 2010-12-31: ...
7392. By Kovid Goyal on 2010-12-31: TCR Output: Fix TCR compression adding junk to the end of the text. Remove compression level option.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk

Subscribers

People subscribed via source and target branches

to all changes:

Ali Baba

Kovid Goyal

Pankaj

Timothy Legge

gstoychev

to status/vote changes:

John Schember

 === modified file 'src/calibre/ebooks/compression/tcr.py'
 --- src/calibre/ebooks/compression/tcr.py	2009-10-20 21:15:27 +0000
 +++ src/calibre/ebooks/compression/tcr.py	2010-12-31 14:03:53 +0000
@@ -6,11 +6,118 @@
  import re
++class TCRCompressor(object):
++    '''
++    TCR compression takes the form header+code_dict+coded_text.
++    The header is always "!!8-Bit!!". The code dict is a list of 256 strings.
++    The list takes the form 1 byte length and then a string. Each position in
++    The list corresponds to a code found in the file. The coded text is
++    string of characters values. for instance the character Q represents the
++    value 81 which corresponds to the string in the code list at position 81.
++    '''
++
++    def _reset(self):
++        # List of indexes in the codes list that are empty and can hold new codes
++        self.unused_codes = set()
++        self.coded_txt = ''
++        # Generate initial codes from text.
++        # The index of the list will be the code that represents the characters at that location
++        # in the list
++        self.codes = []
++
++    def _combine_codes(self):
++        '''
++        Combine two codes that always appear in pair into a single code.
++        The intent is to create more unused codes.
++        '''
++        possible_codes = []
++        a_code = set(re.findall('(?msu).', self.coded_txt))
++
++        for code in a_code:
++            single_code = set(re.findall('(?msu)%s.' % re.escape(code), self.coded_txt))
++            if len(single_code) == 1:
++                possible_codes.append(single_code.pop())
++
++        for code in possible_codes:
++            self.coded_txt = self.coded_txt.replace(code, code[0])
++            self.codes[ord(code[0])] = '%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])
++
++    def _free_unused_codes(self):
++        '''
++        Look for codes that do no not appear in the coded text and add them to
++        the list of free codes.
++        '''
++        for i in xrange(256):
++            if i not in self.unused_codes:
++                if chr(i) not in self.coded_txt:
++                    self.unused_codes.add(i)
++
++    def _new_codes(self):
++        '''
++        Create new codes from codes that occur in pairs often.
++        '''
++        possible_new_codes = list(set(re.findall('(?msu)..', self.coded_txt)))
++        new_codes_count = []
++
++        for c in possible_new_codes:
++            count = self.coded_txt.count(c)
++            # Less than 3 occurrences will not produce any size reduction.
++            if count > 2:
++                new_codes_count.append((c, count))
++
++        # Arrange the codes in order of least to most occurring.
++        possible_new_codes = [x[0] for x in sorted(new_codes_count, key=lambda c: c[1])]
++
++        return possible_new_codes
++
++    def compress(self, txt):
++        self._reset()
++
++        self.codes = list(set(re.findall('(?msu).', txt)))
++
++        # Replace the text with their corresponding code
++        for c in txt:
++            self.coded_txt += chr(self.codes.index(c))
++
++        # Zero the unused codes and record which are unused.
++        for i in range(len(self.codes), 256):
++            self.codes.append('')
++            self.unused_codes.add(i)
++
++        self._combine_codes()
++        possible_codes = self._new_codes()
++
++        while possible_codes and self.unused_codes:
++            while possible_codes and self.unused_codes:
++                unused_code = self.unused_codes.pop()
++                # Take the last possible codes and split it into individual
++                # codes. The last possible code is the most often occurring.
++                code1, code2 = possible_codes.pop()
++                self.codes[unused_code] = '%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
++                self.coded_txt = self.coded_txt.replace('%s%s' % (code1, code2), chr(unused_code))
++            self._combine_codes()
++            self._free_unused_codes()
++            possible_codes = self._new_codes()
++
++        self._free_unused_codes()
++
++        # Generate the code dictionary.
++        code_dict = []
++        for i in xrange(0, 256):
++            if i in self.unused_codes:
++                code_dict.append(chr(0))
++            else:
++                code_dict.append(chr(len(self.codes[i])) + self.codes[i])
++
++        # Join the identifier with the dictionary and coded text.
++        return '!!8-Bit!!'+''.join(code_dict)+self.coded_txt
++
++
  def decompress(stream):
          txt = []
          stream.seek(0)
          if stream.read(9) != '!!8-Bit!!':
--            raise ValueError('File %s contaions an invalid TCR header.' % stream.name)
++            raise ValueError('File %s contains an invalid TCR header.' % stream.name)
          # Codes that the file contents are broken down into.
          entries = []
@@ -26,101 +133,6 @@
          return ''.join(txt)
--
--def compress(txt, level=5):
--    '''
--    TCR compression takes the form header+code_list+coded_text.
--    The header is always "!!8-Bit!!". The code list is a list of 256 strings.
--    The list takes the form 1 byte length and then a string. Each position in
--    The list corresponds to a code found in the file. The coded text is
--    string of characters vaules. for instance the character Q represents the
--    value 81 which corresponds to the string in the code list at position 81.
--    '''
--    # Turn each unique character into a coded value.
--    # The code of the string at a given position are represented by the position
--    # they occupy in the list.
--    codes = list(set(re.findall('(?msu).', txt)))
--    for i in range(len(codes), 256):
--        codes.append('')
--    # Set the compression level.
--    if level <= 1:
--        new_length = 256
--    if level >= 10:
--        new_length = 1
--    else:
--        new_length = int(256 * (10 - level) * .1)
--    new_length = 1 if new_length < 1 else new_length
--    # Replace txt with codes.
--    coded_txt = ''
--    for c in txt:
--        coded_txt += chr(codes.index(c))
--    txt = coded_txt
--    # Start compressing the text.
--    new = True
--    merged = True
--    while new or merged:
--        # Merge codes that always follow another code
--        merge = []
--        merged = False
--        for i in xrange(256):
--            if codes[i] != '':
--                # Find all codes that are next to i.
--                fall = list(set(re.findall('(?msu)%s.' % re.escape(chr(i)), txt)))
--                # 1 if only one code comes after i.
--                if len(fall) == 1:
--                    # We are searching codes and each code is always 1 character.
--                    j = ord(fall[0][1:2])
--                    # Only merge if the total length of the string represented by
--                    # code is less than 256.
--                    if len(codes[i]) + len(codes[j]) < 256:
--                        merge.append((i, j))
--        if merge:
--            merged = True
--            for i, j in merge:
--                # Merge the string for j into the string for i.
--                if i == j:
--                    # Don't use += here just in case something goes wrong. This
--                    # will prevent out of control memory consumption. This is
--                    # unecessary but when creating this routine it happened due
--                    # to an error.
--                    codes[i] = codes[i] + codes[i]
--                else:
--                    codes[i] = codes[i] + codes[j]
--                txt = txt.replace(chr(i)+chr(j), chr(i))
--                if chr(j) not in txt:
--                    codes[j] = ''
--        new = False
--        if '' in codes:
--            # Create a list of codes based on combinations of codes that are next
--            # to each other. The amount of savings for the new code is calculated.
--            new_codes = []
--            for c in list(set(re.findall('(?msu)..', txt))):
--                i = ord(c[0:1])
--                j = ord(c[1:2])
--                if codes[i]+codes[j] in codes:
--                    continue
--                savings = txt.count(chr(i)+chr(j)) - len(codes[i]) - len(codes[j])
--                if savings > 2 and len(codes[i]) + len(codes[j]) < 256:
--                    new_codes.append((savings, i, j, codes[i], codes[j]))
--            if new_codes:
--                new = True
--                # Sort the codes from highest savings to lowest.
--                new_codes.sort(lambda x, y: -1 if x[0] > y[0] else 1 if x[0] < y[0] else 0)
--                # The shorter new_length the more chances time merging will happen
--                # giving more changes for better codes to be created. However,
--                # the shorter new_lengh the longer it will take to compress.
--                new_codes = new_codes[:new_length]
--                for code in new_codes:
--                    if '' not in codes:
--                        break
--                    c = codes.index('')
--                    codes[c] = code[3]+code[4]
--                    txt = txt.replace(chr(code[1])+chr(code[2]), chr(c))
--    # Generate the code dictionary.
--    header = []
--    for code in codes:
--        header.append(chr(len(code))+code)
--    for i in xrange(len(header), 256):
--        header.append(chr(0))
--    # Join the identifier with the dictionary and coded text.
--    return '!!8-Bit!!'+''.join(header)+txt
++def compress(txt):
++    t = TCRCompressor()
++    return t.compress(txt)
 === modified file 'src/calibre/ebooks/tcr/output.py'
 --- src/calibre/ebooks/tcr/output.py	2009-10-23 11:58:25 +0000
 +++ src/calibre/ebooks/tcr/output.py	2010-12-31 14:03:53 +0000
@@ -22,11 +22,6 @@
              level=OptionRecommendation.LOW,
              help=_('Specify the character encoding of the output document. ' \
              'The default is utf-8.')),
--        OptionRecommendation(name='compression_level', recommended_value=5,
--            level=OptionRecommendation.LOW,
--            help=_('Specify the compression level to use. Scale 1 - 10. 1 ' \
--            'being the lowest compression but the fastest and 10 being the ' \
--            'highest compression but the slowest.')),
      ])
      def convert(self, oeb_book, output_path, input_plugin, opts, log):
@@ -48,7 +43,7 @@
          txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace')
          log.info('Compressing text...')
--        txt = compress(txt, opts.compression_level)
++        txt = compress(txt)
          out_stream.seek(0)
          out_stream.truncate()

calibre

Merge lp:~user-none/calibre/dev into lp:calibre

Commit message

Description of the change

Preview Diff

Subscribers