1
=== modified file 'src/calibre/ebooks/conversion/preprocess.py'
2
--- src/calibre/ebooks/conversion/preprocess.py	2012-04-13 15:23:43 +0000
3
+++ src/calibre/ebooks/conversion/preprocess.py	2012-04-20 17:04:32 +0000
4
@@ -559,7 +559,7 @@
5
559
                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
559
                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
6
560
                end_rules.append(
560
                end_rules.append(
7
561
                    # Un wrap using punctuation
561
                    # Un wrap using punctuation
9
562
                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
562
                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
10
563
                )
563
                )
11
564
564
12
565
        for rule in self.PREPROCESS + start_rules:
565
        for rule in self.PREPROCESS + start_rules:
13
566
566
14
=== modified file 'src/calibre/ebooks/conversion/utils.py'
15
--- src/calibre/ebooks/conversion/utils.py	2012-04-20 13:52:57 +0000
16
+++ src/calibre/ebooks/conversion/utils.py	2012-04-20 17:04:32 +0000
17
@@ -316,10 +316,18 @@
18
316
        '''
316
        '''
19
317
        Unwraps lines based on line length and punctuation
317
        Unwraps lines based on line length and punctuation
20
318
        supports a range of html markup and text files
318
        supports a range of html markup and text files
21
319
        
22
320
        the lookahead regex below is meant to look for any non-full stop characters - punctuation
23
321
        characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
24
322
        the reason for this is to prevent false positive wrapping.  False positives are more
25
323
        difficult to detect than false negatives during a manual review of the doc
26
324
        
27
325
        This function intentionally leaves hyphenated content alone as that is handled by the 
28
326
        dehyphenate routine in a separate step
29
319
        '''
327
        '''
30
328
31
320
        # define the pieces of the regex
329
        # define the pieces of the regex
34
321
330
        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
33
322
        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
35
323
        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
331
        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
36
324
        soft_hyphen = u"\xad"
332
        soft_hyphen = u"\xad"
37
325
        line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
333
        line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
Status:	Merged
Merged at revision:	11963
Proposed branch:	lp:~ldolse/calibre/heuristics
Merge into:	lp:calibre
Diff against target:	37 lines (+11/-3) 2 files modified src/calibre/ebooks/conversion/preprocess.py (+1/-1) src/calibre/ebooks/conversion/utils.py (+10/-2)
To merge this branch:	bzr merge lp:~ldolse/calibre/heuristics
Related bugs:	Link a bug report
Reviewer	Review Type	Date Requested	Status
Kovid Goyal		2012-04-20	Pending
Review via email: mp+102903@code.launchpad.net
1	=== modified file 'src/calibre/ebooks/conversion/preprocess.py'
2	--- src/calibre/ebooks/conversion/preprocess.py 2012-04-13 15:23:43 +0000
3	+++ src/calibre/ebooks/conversion/preprocess.py 2012-04-20 17:04:32 +0000
4	@@ -559,7 +559,7 @@
5	559	end_rules.append((re.compile(u'(?<=.{%i}[–—])\s<p>\s(?=[[a-z\d])' % length), lambda match: ''))	559	end_rules.append((re.compile(u'(?<=.{%i}[–—])\s<p>\s(?=[[a-z\d])' % length), lambda match: ''))
6	560	end_rules.append(	560	end_rules.append(
7	561	# Un wrap using punctuation	561	# Un wrap using punctuation
9	562	(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]\|(?<!\&\w{4});))\s(?P<ital></(i\|b\|u)>)?\s(</p>\s<p>\s)+\s(?=(<(i\|b\|u)>)?\s[\w\d$(])' % length, re.UNICODE), wrap_lines),	562	(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]\|(?<!\&\w{4});))\s(?P<ital></(i\|b\|u)>)?\s(</p>\s<p>\s)+\s(?=(<(i\|b\|u)>)?\s[\w\d$(])' % length, re.UNICODE), wrap_lines),
10	563	)	563	)
11	564		564
12	565	for rule in self.PREPROCESS + start_rules:	565	for rule in self.PREPROCESS + start_rules:
13	566		566
14	=== modified file 'src/calibre/ebooks/conversion/utils.py'
15	--- src/calibre/ebooks/conversion/utils.py 2012-04-20 13:52:57 +0000
16	+++ src/calibre/ebooks/conversion/utils.py 2012-04-20 17:04:32 +0000
17	@@ -316,10 +316,18 @@
18	316	'''	316	'''
19	317	Unwraps lines based on line length and punctuation	317	Unwraps lines based on line length and punctuation
20	318	supports a range of html markup and text files	318	supports a range of html markup and text files
21			319
22			320	the lookahead regex below is meant to look for any non-full stop characters - punctuation
23			321	characters which can be used as a full stop should not be added below - e.g. ?!“”. etc
24			322	the reason for this is to prevent false positive wrapping. False positives are more
25			323	difficult to detect than false negatives during a manual review of the doc
26			324
27			325	This function intentionally leaves hyphenated content alone as that is handled by the
28			326	dehyphenate routine in a separate step
29	319	'''	327	'''
30			328
31	320	# define the pieces of the regex	329	# define the pieces of the regex
34	321		330	lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]\|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
33	322	lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]\|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
35	323	em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"	331	em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
36	324	soft_hyphen = u"\xad"	332	soft_hyphen = u"\xad"
37	325	line_ending = "\s</(span\|[iubp]\|div)>\s(</(span\|[iubp]\|div)>)?"	333	line_ending = "\s</(span\|[iubp]\|div)>\s(</(span\|[iubp]\|div)>)?"