Merge lp:~user-none/calibre/dev into lp:calibre

Proposed by John Schember
Status: Merged
Merged at revision: 7392
Proposed branch: lp:~user-none/calibre/dev
Merge into: lp:calibre
Diff against target: 253 lines (+112/-105)
2 files modified
src/calibre/ebooks/compression/tcr.py (+111/-99)
src/calibre/ebooks/tcr/output.py (+1/-6)
To merge this branch: bzr merge lp:~user-none/calibre/dev
Reviewer Review Type Date Requested Status
Kovid Goyal Pending
Review via email: mp+44957@code.launchpad.net

Description of the change

TCR compression rewrite. Huge improvement in all aspects.

To post a comment you must log in.
lp:~user-none/calibre/dev updated
7391. By Kovid Goyal

...

7392. By Kovid Goyal

TCR Output: Fix TCR compression adding junk to the end of the text. Remove compression level option.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'src/calibre/ebooks/compression/tcr.py'
2--- src/calibre/ebooks/compression/tcr.py 2009-10-20 21:15:27 +0000
3+++ src/calibre/ebooks/compression/tcr.py 2010-12-31 14:03:53 +0000
4@@ -6,11 +6,118 @@
5
6 import re
7
8+class TCRCompressor(object):
9+ '''
10+ TCR compression takes the form header+code_dict+coded_text.
11+ The header is always "!!8-Bit!!". The code dict is a list of 256 strings.
12+ The list takes the form 1 byte length and then a string. Each position in
13+ The list corresponds to a code found in the file. The coded text is
14+ string of characters values. for instance the character Q represents the
15+ value 81 which corresponds to the string in the code list at position 81.
16+ '''
17+
18+ def _reset(self):
19+ # List of indexes in the codes list that are empty and can hold new codes
20+ self.unused_codes = set()
21+ self.coded_txt = ''
22+ # Generate initial codes from text.
23+ # The index of the list will be the code that represents the characters at that location
24+ # in the list
25+ self.codes = []
26+
27+ def _combine_codes(self):
28+ '''
29+ Combine two codes that always appear in pair into a single code.
30+ The intent is to create more unused codes.
31+ '''
32+ possible_codes = []
33+ a_code = set(re.findall('(?msu).', self.coded_txt))
34+
35+ for code in a_code:
36+ single_code = set(re.findall('(?msu)%s.' % re.escape(code), self.coded_txt))
37+ if len(single_code) == 1:
38+ possible_codes.append(single_code.pop())
39+
40+ for code in possible_codes:
41+ self.coded_txt = self.coded_txt.replace(code, code[0])
42+ self.codes[ord(code[0])] = '%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])
43+
44+ def _free_unused_codes(self):
45+ '''
46+ Look for codes that do no not appear in the coded text and add them to
47+ the list of free codes.
48+ '''
49+ for i in xrange(256):
50+ if i not in self.unused_codes:
51+ if chr(i) not in self.coded_txt:
52+ self.unused_codes.add(i)
53+
54+ def _new_codes(self):
55+ '''
56+ Create new codes from codes that occur in pairs often.
57+ '''
58+ possible_new_codes = list(set(re.findall('(?msu)..', self.coded_txt)))
59+ new_codes_count = []
60+
61+ for c in possible_new_codes:
62+ count = self.coded_txt.count(c)
63+ # Less than 3 occurrences will not produce any size reduction.
64+ if count > 2:
65+ new_codes_count.append((c, count))
66+
67+ # Arrange the codes in order of least to most occurring.
68+ possible_new_codes = [x[0] for x in sorted(new_codes_count, key=lambda c: c[1])]
69+
70+ return possible_new_codes
71+
72+ def compress(self, txt):
73+ self._reset()
74+
75+ self.codes = list(set(re.findall('(?msu).', txt)))
76+
77+ # Replace the text with their corresponding code
78+ for c in txt:
79+ self.coded_txt += chr(self.codes.index(c))
80+
81+ # Zero the unused codes and record which are unused.
82+ for i in range(len(self.codes), 256):
83+ self.codes.append('')
84+ self.unused_codes.add(i)
85+
86+ self._combine_codes()
87+ possible_codes = self._new_codes()
88+
89+ while possible_codes and self.unused_codes:
90+ while possible_codes and self.unused_codes:
91+ unused_code = self.unused_codes.pop()
92+ # Take the last possible codes and split it into individual
93+ # codes. The last possible code is the most often occurring.
94+ code1, code2 = possible_codes.pop()
95+ self.codes[unused_code] = '%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
96+ self.coded_txt = self.coded_txt.replace('%s%s' % (code1, code2), chr(unused_code))
97+ self._combine_codes()
98+ self._free_unused_codes()
99+ possible_codes = self._new_codes()
100+
101+ self._free_unused_codes()
102+
103+ # Generate the code dictionary.
104+ code_dict = []
105+ for i in xrange(0, 256):
106+ if i in self.unused_codes:
107+ code_dict.append(chr(0))
108+ else:
109+ code_dict.append(chr(len(self.codes[i])) + self.codes[i])
110+
111+ # Join the identifier with the dictionary and coded text.
112+ return '!!8-Bit!!'+''.join(code_dict)+self.coded_txt
113+
114+
115 def decompress(stream):
116 txt = []
117 stream.seek(0)
118 if stream.read(9) != '!!8-Bit!!':
119- raise ValueError('File %s contaions an invalid TCR header.' % stream.name)
120+ raise ValueError('File %s contains an invalid TCR header.' % stream.name)
121
122 # Codes that the file contents are broken down into.
123 entries = []
124@@ -26,101 +133,6 @@
125
126 return ''.join(txt)
127
128-
129-def compress(txt, level=5):
130- '''
131- TCR compression takes the form header+code_list+coded_text.
132- The header is always "!!8-Bit!!". The code list is a list of 256 strings.
133- The list takes the form 1 byte length and then a string. Each position in
134- The list corresponds to a code found in the file. The coded text is
135- string of characters vaules. for instance the character Q represents the
136- value 81 which corresponds to the string in the code list at position 81.
137- '''
138- # Turn each unique character into a coded value.
139- # The code of the string at a given position are represented by the position
140- # they occupy in the list.
141- codes = list(set(re.findall('(?msu).', txt)))
142- for i in range(len(codes), 256):
143- codes.append('')
144- # Set the compression level.
145- if level <= 1:
146- new_length = 256
147- if level >= 10:
148- new_length = 1
149- else:
150- new_length = int(256 * (10 - level) * .1)
151- new_length = 1 if new_length < 1 else new_length
152- # Replace txt with codes.
153- coded_txt = ''
154- for c in txt:
155- coded_txt += chr(codes.index(c))
156- txt = coded_txt
157- # Start compressing the text.
158- new = True
159- merged = True
160- while new or merged:
161- # Merge codes that always follow another code
162- merge = []
163- merged = False
164- for i in xrange(256):
165- if codes[i] != '':
166- # Find all codes that are next to i.
167- fall = list(set(re.findall('(?msu)%s.' % re.escape(chr(i)), txt)))
168- # 1 if only one code comes after i.
169- if len(fall) == 1:
170- # We are searching codes and each code is always 1 character.
171- j = ord(fall[0][1:2])
172- # Only merge if the total length of the string represented by
173- # code is less than 256.
174- if len(codes[i]) + len(codes[j]) < 256:
175- merge.append((i, j))
176- if merge:
177- merged = True
178- for i, j in merge:
179- # Merge the string for j into the string for i.
180- if i == j:
181- # Don't use += here just in case something goes wrong. This
182- # will prevent out of control memory consumption. This is
183- # unecessary but when creating this routine it happened due
184- # to an error.
185- codes[i] = codes[i] + codes[i]
186- else:
187- codes[i] = codes[i] + codes[j]
188- txt = txt.replace(chr(i)+chr(j), chr(i))
189- if chr(j) not in txt:
190- codes[j] = ''
191- new = False
192- if '' in codes:
193- # Create a list of codes based on combinations of codes that are next
194- # to each other. The amount of savings for the new code is calculated.
195- new_codes = []
196- for c in list(set(re.findall('(?msu)..', txt))):
197- i = ord(c[0:1])
198- j = ord(c[1:2])
199- if codes[i]+codes[j] in codes:
200- continue
201- savings = txt.count(chr(i)+chr(j)) - len(codes[i]) - len(codes[j])
202- if savings > 2 and len(codes[i]) + len(codes[j]) < 256:
203- new_codes.append((savings, i, j, codes[i], codes[j]))
204- if new_codes:
205- new = True
206- # Sort the codes from highest savings to lowest.
207- new_codes.sort(lambda x, y: -1 if x[0] > y[0] else 1 if x[0] < y[0] else 0)
208- # The shorter new_length the more chances time merging will happen
209- # giving more changes for better codes to be created. However,
210- # the shorter new_lengh the longer it will take to compress.
211- new_codes = new_codes[:new_length]
212- for code in new_codes:
213- if '' not in codes:
214- break
215- c = codes.index('')
216- codes[c] = code[3]+code[4]
217- txt = txt.replace(chr(code[1])+chr(code[2]), chr(c))
218- # Generate the code dictionary.
219- header = []
220- for code in codes:
221- header.append(chr(len(code))+code)
222- for i in xrange(len(header), 256):
223- header.append(chr(0))
224- # Join the identifier with the dictionary and coded text.
225- return '!!8-Bit!!'+''.join(header)+txt
226+def compress(txt):
227+ t = TCRCompressor()
228+ return t.compress(txt)
229
230=== modified file 'src/calibre/ebooks/tcr/output.py'
231--- src/calibre/ebooks/tcr/output.py 2009-10-23 11:58:25 +0000
232+++ src/calibre/ebooks/tcr/output.py 2010-12-31 14:03:53 +0000
233@@ -22,11 +22,6 @@
234 level=OptionRecommendation.LOW,
235 help=_('Specify the character encoding of the output document. ' \
236 'The default is utf-8.')),
237- OptionRecommendation(name='compression_level', recommended_value=5,
238- level=OptionRecommendation.LOW,
239- help=_('Specify the compression level to use. Scale 1 - 10. 1 ' \
240- 'being the lowest compression but the fastest and 10 being the ' \
241- 'highest compression but the slowest.')),
242 ])
243
244 def convert(self, oeb_book, output_path, input_plugin, opts, log):
245@@ -48,7 +43,7 @@
246 txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace')
247
248 log.info('Compressing text...')
249- txt = compress(txt, opts.compression_level)
250+ txt = compress(txt)
251
252 out_stream.seek(0)
253 out_stream.truncate()

Subscribers

People subscribed via source and target branches

to status/vote changes: