Status: | Merged |
---|---|
Merged at revision: | 7392 |
Proposed branch: | lp:~user-none/calibre/dev |
Merge into: | lp:calibre |
Diff against target: |
253 lines (+112/-105) 2 files modified
src/calibre/ebooks/compression/tcr.py (+111/-99) src/calibre/ebooks/tcr/output.py (+1/-6) |
To merge this branch: | bzr merge lp:~user-none/calibre/dev |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Kovid Goyal | Pending | ||
Review via email: mp+44957@code.launchpad.net |
Commit message
Description of the change
TCR compression rewrite. Huge improvement in all aspects.
To post a comment you must log in.
lp:~user-none/calibre/dev
updated
- 7391. By Kovid Goyal
-
...
- 7392. By Kovid Goyal
-
TCR Output: Fix TCR compression adding junk to the end of the text. Remove compression level option.
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'src/calibre/ebooks/compression/tcr.py' |
2 | --- src/calibre/ebooks/compression/tcr.py 2009-10-20 21:15:27 +0000 |
3 | +++ src/calibre/ebooks/compression/tcr.py 2010-12-31 14:03:53 +0000 |
4 | @@ -6,11 +6,118 @@ |
5 | |
6 | import re |
7 | |
8 | +class TCRCompressor(object): |
9 | + ''' |
10 | + TCR compression takes the form header+code_dict+coded_text. |
11 | + The header is always "!!8-Bit!!". The code dict is a list of 256 strings. |
12 | + The list takes the form 1 byte length and then a string. Each position in |
13 | + The list corresponds to a code found in the file. The coded text is |
14 | + string of characters values. for instance the character Q represents the |
15 | + value 81 which corresponds to the string in the code list at position 81. |
16 | + ''' |
17 | + |
18 | + def _reset(self): |
19 | + # List of indexes in the codes list that are empty and can hold new codes |
20 | + self.unused_codes = set() |
21 | + self.coded_txt = '' |
22 | + # Generate initial codes from text. |
23 | + # The index of the list will be the code that represents the characters at that location |
24 | + # in the list |
25 | + self.codes = [] |
26 | + |
27 | + def _combine_codes(self): |
28 | + ''' |
29 | + Combine two codes that always appear in pair into a single code. |
30 | + The intent is to create more unused codes. |
31 | + ''' |
32 | + possible_codes = [] |
33 | + a_code = set(re.findall('(?msu).', self.coded_txt)) |
34 | + |
35 | + for code in a_code: |
36 | + single_code = set(re.findall('(?msu)%s.' % re.escape(code), self.coded_txt)) |
37 | + if len(single_code) == 1: |
38 | + possible_codes.append(single_code.pop()) |
39 | + |
40 | + for code in possible_codes: |
41 | + self.coded_txt = self.coded_txt.replace(code, code[0]) |
42 | + self.codes[ord(code[0])] = '%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])]) |
43 | + |
44 | + def _free_unused_codes(self): |
45 | + ''' |
46 | + Look for codes that do no not appear in the coded text and add them to |
47 | + the list of free codes. |
48 | + ''' |
49 | + for i in xrange(256): |
50 | + if i not in self.unused_codes: |
51 | + if chr(i) not in self.coded_txt: |
52 | + self.unused_codes.add(i) |
53 | + |
54 | + def _new_codes(self): |
55 | + ''' |
56 | + Create new codes from codes that occur in pairs often. |
57 | + ''' |
58 | + possible_new_codes = list(set(re.findall('(?msu)..', self.coded_txt))) |
59 | + new_codes_count = [] |
60 | + |
61 | + for c in possible_new_codes: |
62 | + count = self.coded_txt.count(c) |
63 | + # Less than 3 occurrences will not produce any size reduction. |
64 | + if count > 2: |
65 | + new_codes_count.append((c, count)) |
66 | + |
67 | + # Arrange the codes in order of least to most occurring. |
68 | + possible_new_codes = [x[0] for x in sorted(new_codes_count, key=lambda c: c[1])] |
69 | + |
70 | + return possible_new_codes |
71 | + |
72 | + def compress(self, txt): |
73 | + self._reset() |
74 | + |
75 | + self.codes = list(set(re.findall('(?msu).', txt))) |
76 | + |
77 | + # Replace the text with their corresponding code |
78 | + for c in txt: |
79 | + self.coded_txt += chr(self.codes.index(c)) |
80 | + |
81 | + # Zero the unused codes and record which are unused. |
82 | + for i in range(len(self.codes), 256): |
83 | + self.codes.append('') |
84 | + self.unused_codes.add(i) |
85 | + |
86 | + self._combine_codes() |
87 | + possible_codes = self._new_codes() |
88 | + |
89 | + while possible_codes and self.unused_codes: |
90 | + while possible_codes and self.unused_codes: |
91 | + unused_code = self.unused_codes.pop() |
92 | + # Take the last possible codes and split it into individual |
93 | + # codes. The last possible code is the most often occurring. |
94 | + code1, code2 = possible_codes.pop() |
95 | + self.codes[unused_code] = '%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)]) |
96 | + self.coded_txt = self.coded_txt.replace('%s%s' % (code1, code2), chr(unused_code)) |
97 | + self._combine_codes() |
98 | + self._free_unused_codes() |
99 | + possible_codes = self._new_codes() |
100 | + |
101 | + self._free_unused_codes() |
102 | + |
103 | + # Generate the code dictionary. |
104 | + code_dict = [] |
105 | + for i in xrange(0, 256): |
106 | + if i in self.unused_codes: |
107 | + code_dict.append(chr(0)) |
108 | + else: |
109 | + code_dict.append(chr(len(self.codes[i])) + self.codes[i]) |
110 | + |
111 | + # Join the identifier with the dictionary and coded text. |
112 | + return '!!8-Bit!!'+''.join(code_dict)+self.coded_txt |
113 | + |
114 | + |
115 | def decompress(stream): |
116 | txt = [] |
117 | stream.seek(0) |
118 | if stream.read(9) != '!!8-Bit!!': |
119 | - raise ValueError('File %s contaions an invalid TCR header.' % stream.name) |
120 | + raise ValueError('File %s contains an invalid TCR header.' % stream.name) |
121 | |
122 | # Codes that the file contents are broken down into. |
123 | entries = [] |
124 | @@ -26,101 +133,6 @@ |
125 | |
126 | return ''.join(txt) |
127 | |
128 | - |
129 | -def compress(txt, level=5): |
130 | - ''' |
131 | - TCR compression takes the form header+code_list+coded_text. |
132 | - The header is always "!!8-Bit!!". The code list is a list of 256 strings. |
133 | - The list takes the form 1 byte length and then a string. Each position in |
134 | - The list corresponds to a code found in the file. The coded text is |
135 | - string of characters vaules. for instance the character Q represents the |
136 | - value 81 which corresponds to the string in the code list at position 81. |
137 | - ''' |
138 | - # Turn each unique character into a coded value. |
139 | - # The code of the string at a given position are represented by the position |
140 | - # they occupy in the list. |
141 | - codes = list(set(re.findall('(?msu).', txt))) |
142 | - for i in range(len(codes), 256): |
143 | - codes.append('') |
144 | - # Set the compression level. |
145 | - if level <= 1: |
146 | - new_length = 256 |
147 | - if level >= 10: |
148 | - new_length = 1 |
149 | - else: |
150 | - new_length = int(256 * (10 - level) * .1) |
151 | - new_length = 1 if new_length < 1 else new_length |
152 | - # Replace txt with codes. |
153 | - coded_txt = '' |
154 | - for c in txt: |
155 | - coded_txt += chr(codes.index(c)) |
156 | - txt = coded_txt |
157 | - # Start compressing the text. |
158 | - new = True |
159 | - merged = True |
160 | - while new or merged: |
161 | - # Merge codes that always follow another code |
162 | - merge = [] |
163 | - merged = False |
164 | - for i in xrange(256): |
165 | - if codes[i] != '': |
166 | - # Find all codes that are next to i. |
167 | - fall = list(set(re.findall('(?msu)%s.' % re.escape(chr(i)), txt))) |
168 | - # 1 if only one code comes after i. |
169 | - if len(fall) == 1: |
170 | - # We are searching codes and each code is always 1 character. |
171 | - j = ord(fall[0][1:2]) |
172 | - # Only merge if the total length of the string represented by |
173 | - # code is less than 256. |
174 | - if len(codes[i]) + len(codes[j]) < 256: |
175 | - merge.append((i, j)) |
176 | - if merge: |
177 | - merged = True |
178 | - for i, j in merge: |
179 | - # Merge the string for j into the string for i. |
180 | - if i == j: |
181 | - # Don't use += here just in case something goes wrong. This |
182 | - # will prevent out of control memory consumption. This is |
183 | - # unecessary but when creating this routine it happened due |
184 | - # to an error. |
185 | - codes[i] = codes[i] + codes[i] |
186 | - else: |
187 | - codes[i] = codes[i] + codes[j] |
188 | - txt = txt.replace(chr(i)+chr(j), chr(i)) |
189 | - if chr(j) not in txt: |
190 | - codes[j] = '' |
191 | - new = False |
192 | - if '' in codes: |
193 | - # Create a list of codes based on combinations of codes that are next |
194 | - # to each other. The amount of savings for the new code is calculated. |
195 | - new_codes = [] |
196 | - for c in list(set(re.findall('(?msu)..', txt))): |
197 | - i = ord(c[0:1]) |
198 | - j = ord(c[1:2]) |
199 | - if codes[i]+codes[j] in codes: |
200 | - continue |
201 | - savings = txt.count(chr(i)+chr(j)) - len(codes[i]) - len(codes[j]) |
202 | - if savings > 2 and len(codes[i]) + len(codes[j]) < 256: |
203 | - new_codes.append((savings, i, j, codes[i], codes[j])) |
204 | - if new_codes: |
205 | - new = True |
206 | - # Sort the codes from highest savings to lowest. |
207 | - new_codes.sort(lambda x, y: -1 if x[0] > y[0] else 1 if x[0] < y[0] else 0) |
208 | - # The shorter new_length the more chances time merging will happen |
209 | - # giving more changes for better codes to be created. However, |
210 | - # the shorter new_lengh the longer it will take to compress. |
211 | - new_codes = new_codes[:new_length] |
212 | - for code in new_codes: |
213 | - if '' not in codes: |
214 | - break |
215 | - c = codes.index('') |
216 | - codes[c] = code[3]+code[4] |
217 | - txt = txt.replace(chr(code[1])+chr(code[2]), chr(c)) |
218 | - # Generate the code dictionary. |
219 | - header = [] |
220 | - for code in codes: |
221 | - header.append(chr(len(code))+code) |
222 | - for i in xrange(len(header), 256): |
223 | - header.append(chr(0)) |
224 | - # Join the identifier with the dictionary and coded text. |
225 | - return '!!8-Bit!!'+''.join(header)+txt |
226 | +def compress(txt): |
227 | + t = TCRCompressor() |
228 | + return t.compress(txt) |
229 | |
230 | === modified file 'src/calibre/ebooks/tcr/output.py' |
231 | --- src/calibre/ebooks/tcr/output.py 2009-10-23 11:58:25 +0000 |
232 | +++ src/calibre/ebooks/tcr/output.py 2010-12-31 14:03:53 +0000 |
233 | @@ -22,11 +22,6 @@ |
234 | level=OptionRecommendation.LOW, |
235 | help=_('Specify the character encoding of the output document. ' \ |
236 | 'The default is utf-8.')), |
237 | - OptionRecommendation(name='compression_level', recommended_value=5, |
238 | - level=OptionRecommendation.LOW, |
239 | - help=_('Specify the compression level to use. Scale 1 - 10. 1 ' \ |
240 | - 'being the lowest compression but the fastest and 10 being the ' \ |
241 | - 'highest compression but the slowest.')), |
242 | ]) |
243 | |
244 | def convert(self, oeb_book, output_path, input_plugin, opts, log): |
245 | @@ -48,7 +43,7 @@ |
246 | txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace') |
247 | |
248 | log.info('Compressing text...') |
249 | - txt = compress(txt, opts.compression_level) |
250 | + txt = compress(txt) |
251 | |
252 | out_stream.seek(0) |
253 | out_stream.truncate() |