Merge lp:~dosage-dev/dosage/bunch-of-comics into lp:~dosage-dev/dosage/old

Proposed by Tristan Seligmann
Status: Merged
Approved by: Jonathan Jacobs
Approved revision: not available
Merged at revision: not available
Proposed branch: lp:~dosage-dev/dosage/bunch-of-comics
Merge into: lp:~dosage-dev/dosage/old
Diff against target: 377 lines (+172/-10)
11 files modified
.bzrignore (+1/-0)
dosage/plugins/a.py (+19/-3)
dosage/plugins/b.py (+18/-0)
dosage/plugins/c.py (+12/-1)
dosage/plugins/g.py (+11/-1)
dosage/plugins/h.py (+21/-0)
dosage/plugins/keenspot.py (+1/-0)
dosage/plugins/l.py (+18/-0)
dosage/plugins/w.py (+1/-0)
dosage/test/test_util.py (+29/-2)
dosage/util.py (+41/-3)
To merge this branch: bzr merge lp:~dosage-dev/dosage/bunch-of-comics
Reviewer Review Type Date Requested Status
Jonathan Jacobs Approve
Review via email: mp+16758@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Jonathan Jacobs (jjacobs) wrote :

  1. There are a number of coding style infractions:
    * Only 2 lines between top-level suites.
    * Lines (not regular expressions) longer than 80 columns.
  2. The regular expression for HateSong.prevSearch can probably be simplified to use exact lengths.
  3. Bellen.imageSearch has an odd regular expression, spaces seem like the kind of thing that would appear in a "src" attribute.
  4. Effbot has a reasonable HTML entity decoder implementation[1] that could be the start of a better normalizeUrl implementation.

[1] http://effbot.org/zone/re-sub.htm#unescape-html

review: Needs Fixing
616. By Tristan Seligmann

Different regex for Bellen.

617. By Tristan Seligmann

More comprehensive quoting thingy.

618. By Tristan Seligmann

Fix some coding style issues.

619. By Tristan Seligmann

Tweak regex.

Revision history for this message
Tristan Seligmann (mithrandi) wrote :

Okay, think I've fixed all of those.

Revision history for this message
Jonathan Jacobs (jjacobs) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== added file '.bzrignore'
2--- .bzrignore 1970-01-01 00:00:00 +0000
3+++ .bzrignore 2010-01-04 20:15:24 +0000
4@@ -0,0 +1,1 @@
5+dropin.cache
6
7=== modified file 'dosage/plugins/a.py'
8--- dosage/plugins/a.py 2009-12-15 06:55:27 +0000
9+++ dosage/plugins/a.py 2010-01-04 20:15:24 +0000
10@@ -1,6 +1,7 @@
11 from re import compile, MULTILINE
12
13-from dosage.helpers import BasicScraper, regexNamer, bounceStarter
14+from dosage.helpers import (
15+ BasicScraper, regexNamer, bounceStarter, indirectStarter)
16
17
18 class ALessonIsLearned(BasicScraper):
19@@ -67,6 +68,18 @@
20 help = 'Index format: nnn'
21
22
23+
24+class AnarchySD(BasicScraper):
25+ imageUrl = 'http://www.anarchycomic.com/page%s.php'
26+ imageSearch = compile(r'<img.+src="../(images/page\d+\..+?)"')
27+ prevSearch = compile(r'<a href="(page\d+\.php)">PREVIOUS PAGE')
28+ help = 'Index format: n (unpadded)'
29+ starter = indirectStarter(
30+ 'http://www.anarchycomic.com/page1.php',
31+ compile(r'<a href="(page\d+\.php)" class="style15">LATEST'))
32+
33+
34+
35 class Altermeta(BasicScraper):
36 latestUrl = 'http://www.altermeta.com/'
37 imageUrl = 'http://www.altermeta.com/index.php?PS=viewComic.php&comic=%s'
38@@ -125,11 +138,14 @@
39
40
41 class AstronomyPOTD(BasicScraper):
42- starter = bounceStarter('http://antwrp.gsfc.nasa.gov/apod/astropix.html', compile(r'<a href="(ap\d{6}\.html)">&gt;</a>'))
43+ starter = bounceStarter(
44+ 'http://antwrp.gsfc.nasa.gov/apod/astropix.html',
45+ compile(r'<a href="(ap\d{6}\.html)">&gt;</a>'))
46 imageUrl = 'http://antwrp.gsfc.nasa.gov/apod/ap%s.html'
47 imageSearch = compile(r'<a href="(image/\d{4}/.+\..+?)">')
48 prevSearch = compile(r'<a href="(ap\d{6}\.html)">&lt;</a>')
49 help = 'Index format: yymmdd'
50
51 def namer(cls, imageUrl, pageUrl):
52- return '%s-%s' % (pageUrl.split('/')[-1].split('.')[0][2:], imageUrl.split('/')[-1].split('.')[0])
53+ return '%s-%s' % (pageUrl.split('/')[-1].split('.')[0][2:],
54+ imageUrl.split('/')[-1].split('.')[0])
55
56=== modified file 'dosage/plugins/b.py'
57--- dosage/plugins/b.py 2009-12-15 06:55:27 +0000
58+++ dosage/plugins/b.py 2010-01-04 20:15:24 +0000
59@@ -169,3 +169,21 @@
60 starslipCrisis = blankLabel('StarslipCrisis', 'http://www.starslipcrisis.com/')
61 uglyHill = blankLabel('UglyHill', 'http://www.uglyhill.com/')
62 wapsiSquare = blankLabel('WapsiSquare', 'http://www.wapsisquare.com/')
63+
64+
65+
66+class BeePower(BasicScraper):
67+ latestUrl = 'http://comicswithoutviolence.com/d/20080713.html'
68+ imageUrl = 'http://comicswithoutviolence.com/d/%s.html'
69+ imageSearch = compile(r'src="(/comics/.+?)"')
70+ prevSearch = compile(r'(\d+\.html)"><img[^>]+?src="/images/previous_day.png"')
71+ help = 'Index format: yyyy/mm/dd'
72+
73+
74+
75+class Bellen(BasicScraper):
76+ latestUrl = 'http://boxbrown.com/'
77+ imageUrl = 'http://boxbrown.com/?p=%s'
78+ imageSearch = compile(r'<img src="(http://boxbrown.com/comics/[^"]+)"')
79+ prevSearch = compile(r'<a href="(.+?)"><span class="prev">')
80+ help = 'Index format: nnn'
81
82=== modified file 'dosage/plugins/c.py'
83--- dosage/plugins/c.py 2009-12-15 06:55:27 +0000
84+++ dosage/plugins/c.py 2010-01-04 20:15:24 +0000
85@@ -1,6 +1,7 @@
86 from re import compile
87
88-from dosage.helpers import BasicScraper, constStarter, bounceStarter, indirectStarter
89+from dosage.helpers import (
90+    BasicScraper, constStarter, bounceStarter, indirectStarter)
91 from dosage.util import getQueryParams
92
93
94@@ -303,9 +304,19 @@
95 zhi = creators('ZackHill', 'zhi')
96
97
98+
99 class CyanideAndHappiness(BasicScraper):
100 latestUrl = 'http://www.explosm.net/comics'
101 imageUrl = 'http://www.explosm.net/comics/%s'
102 imageSearch = compile(r'<img alt="Cyanide and Happiness, a daily webcomic" src="(http:\/\/www\.explosm\.net/db/files/Comics/\w+/\S+\.\w+)"')
103 prevSearch = compile(r'<a href="(/comics/\d+/?)">< Previous</a>')
104 help = 'Index format: n (unpadded)'
105+
106+
107+
108+class CrimsonDark(BasicScraper):
109+ latestUrl = 'http://www.davidcsimon.com/crimsondark/'
110+ imageUrl = 'http://www.davidcsimon.com/crimsondark/index.php?view=comic&strip_id=%s'
111+ imageSearch = compile(r'src="(.+?strips/.+?)"')
112+ prevSearch = compile(r'<a href=[\'"](/crimsondark/index\.php\?view=comic&amp;strip_id=\d+)[\'"]><img src=[\'"]themes/cdtheme/images/active_prev.png[\'"]')
113+ help = 'Index format: n (unpadded)'
114
115=== modified file 'dosage/plugins/g.py'
116--- dosage/plugins/g.py 2009-12-15 06:55:27 +0000
117+++ dosage/plugins/g.py 2010-01-04 20:15:24 +0000
118@@ -83,9 +83,19 @@
119 help = 'Index format: n'
120
121
122-class GunnerkrigCourt(BasicScraper):
123+
124+class GunnerkrigCourt(BasicScraper):
125 latestUrl = 'http://www.gunnerkrigg.com/index2.php'
126 imageUrl = 'http://www.gunnerkrigg.com/archive_page.php\?comicID=%s'
127 imageSearch = compile(r'<img src="(.+?//comics/.+?)"')
128 prevSearch = compile(r'<.+?(/archive_page.php\?comicID=.+?)".+?prev')
129 help = 'Index format: n'
130+
131+
132+
133+class Gunshow(BasicScraper):
134+ latestUrl = 'http://gunshowcomic.com/'
135+ imageUrl = 'http://gunshowcomic.com/d/%s.html'
136+ imageSearch = compile(r'src="(/comics/.+?)"')
137+ prevSearch = compile(r'(/d/\d+\.html)"><img[^>]+?src="/images/previous_day')
138+ help = 'Index format: yyyy/mm/dd'
139
140=== modified file 'dosage/plugins/h.py'
141--- dosage/plugins/h.py 2009-12-15 06:55:27 +0000
142+++ dosage/plugins/h.py 2010-01-04 20:15:24 +0000
143@@ -3,6 +3,7 @@
144 from dosage.helpers import BasicScraper
145
146
147+
148 class HappyMedium(BasicScraper):
149 latestUrl = 'http://happymedium.fast-bee.com/'
150 imageUrl = 'http://happymedium.fast-bee.com/%s'
151@@ -11,6 +12,7 @@
152 help = 'Index format: yyyy/mm/chapter-n-page-n'
153
154
155+
156 class Heliothaumic(BasicScraper):
157 latestUrl = 'http://thaumic.net/'
158 imageUrl = 'http://thaumic.net/%s'
159@@ -19,9 +21,28 @@
160 help = 'Index format: yyyy/mm/dd/n(unpadded)-comicname'
161
162
163+
164 class Housd(BasicScraper):
165 latestUrl = 'http://www.housd.net/'
166 imageUrl = 'http://housd.net/archive_page.php?comicID=%s'
167 imageSearch = compile(r'"(.+?/comics/.+?)"')
168 prevSearch = compile(r'"(h.+?comicID=.+?)".+?prev')
169 help = 'Index format: nnnn'
170+
171+
172+
173+class HateSong(BasicScraper):
174+ latestUrl = 'http://hatesong.com/'
175+ imageUrl = 'http://hatesong.com/%s/'
176+ imageSearch = compile(r'src="(http://www.hatesong.com/strips/.+?)"')
177+ prevSearch = compile(r'<div class="headernav"><a href="(http://hatesong.com/\d{4}/\d{2}/\d{2})')
178+ help = 'Index format: yyyy/mm/dd'
179+
180+
181+
182+class HorribleVille(BasicScraper):
183+ latestUrl = 'http://horribleville.com/d/20090517.html'
184+ imageUrl = 'http://horribleville.com/d/%s.html'
185+ imageSearch = compile(r'src="(/comics/.+?)"')
186+ prevSearch = compile(r'(\d+\.html)"><img[^>]+?src="/images/previous_day.png"')
187+ help = 'Index format: yyyy/mm/dd'
188
189=== modified file 'dosage/plugins/keenspot.py'
190--- dosage/plugins/keenspot.py 2009-12-15 06:55:27 +0000
191+++ dosage/plugins/keenspot.py 2010-01-04 20:15:24 +0000
192@@ -456,6 +456,7 @@
193 'FanserviceMeteorologyWin': 'http://aod.comicgenesis.com/',
194 'FantasticalBestiary': 'http://fantasticalbestiary.comicgenesis.com/',
195 'FantasyQwest': 'http://creatorauthorman.comicgenesis.com/',
196+ 'FaultyLogic': 'http://faultylogic.comicgenesis.com/',
197 'Feathers': 'http://feathers.comicgenesis.com/',
198 'FelixAndTheKidneyEater': 'http://fnk.comicgenesis.com/',
199 'Fellonist': 'http://thefellonist.comicgenesis.com/',
200
201=== modified file 'dosage/plugins/l.py'
202--- dosage/plugins/l.py 2009-12-15 06:55:27 +0000
203+++ dosage/plugins/l.py 2010-01-04 20:15:24 +0000
204@@ -98,3 +98,21 @@
205 # prevSearch=compile(r'<a href="(index.php\?comicid=\d+)"><img src="/images/gprev.gif"', IGNORECASE),
206 # help='Index format: n (unpadded)',
207 # namer=queryNamer('comicid'))
208+
209+
210+
211+class LegoRobot(BasicScraper):
212+ latestUrl = 'http://www.legorobotcomics.com/'
213+ imageUrl = 'http://www.legorobotcomics.com/?id=%s'
214+ imageSearch = compile(r'id="the_comic" src="(comics/.+?)"')
215+ prevSearch = compile(r'(\?id=\d+)"><img src="images/back.png"')
216+ help = 'Index format: nnnn'
217+
218+
219+
220+class LeastICouldDo(BasicScraper):
221+ latestUrl = 'http://www.leasticoulddo.com/'
222+ imageUrl = 'http://www.leasticoulddo.com/comic/%s'
223+ imageSearch = compile(r'<img src="(http://cdn.leasticoulddo.com/comics/\d{8}.\w{1,4})" />')
224+ prevSearch = compile(r'<a href="(/comic/\d{8})">Previous</a>')
225+ help = 'Index format: yyyymmdd'
226
227=== modified file 'dosage/plugins/w.py'
228--- dosage/plugins/w.py 2009-12-15 06:55:27 +0000
229+++ dosage/plugins/w.py 2010-01-04 20:15:24 +0000
230@@ -104,6 +104,7 @@
231 'NekkoAndJoruba': 'nekkoandjoruba/nekkoandjoruba/',
232 'JaxEpoch': 'johngreen/quicken/',
233 'QuantumRockOfAges': 'DreamchildNYC/quantum/',
234+ 'ClownSamurai' : 'qsamurai/clownsamurai/',
235 }
236
237 return dict((name, WebcomicsNation.make('WebcomicsNation/' + name, latestUrl='http://www.webcomicsnation.com/' + subpath)) for name, subpath in comics.iteritems())
238
239=== modified file 'dosage/test/test_util.py'
240--- dosage/test/test_util.py 2009-12-15 06:55:27 +0000
241+++ dosage/test/test_util.py 2010-01-04 20:15:24 +0000
242@@ -1,6 +1,8 @@
243 from twisted.trial.unittest import TestCase
244
245-from dosage.util import saneDataSize
246+from dosage.util import saneDataSize, normaliseURL, _unescape
247+
248+
249
250 class SizeFormattingTests(TestCase):
251 """
252@@ -15,6 +17,7 @@
253 self.assertEqual(saneDataSize(size), expectedOutput)
254 self.assertEqual(saneDataSize(-size), '-' + expectedOutput)
255
256+
257 def test_verySmallSize(self):
258 """
259 Sizes smaller than a single byte should be formatted as bytes; this
260@@ -22,12 +25,13 @@
261 """
262 self.check(0.1, '0.100 B')
263
264+
265 def test_normalSizes(self):
266 """
267 Sizes should be formatted in the largest unit for which the size will
268 not be less than a single unit.
269 """
270- self.check(1, '1.000 B')
271+ self.check(1, '1.000 B')
272 self.check(2.075 * 2 ** 10, '2.075 kB')
273 self.check(5.88 * 2 ** 20, '5.880 MB')
274 self.check(13.34 * 2 ** 30, '13.340 GB')
275@@ -37,8 +41,31 @@
276 self.check(57.892 * 2 ** 70, '57.892 ZB')
277 self.check(999.99 * 2 ** 80, '999.990 YB')
278
279+
280 def test_veryLargeSize(self):
281 """
282 Sizes larger than 1024 yottabytes should be formatted as yottabytes.
283 """
284 self.check(5567254 * 2 ** 80, '5567254.000 YB')
285+
286+
287+
288+class URLTests(TestCase):
289+ """
290+ Tests for URL utility functions.
291+ """
292+ def test_unescape(self):
293+ """
294+ Test HTML replacement.
295+ """
296+ self.assertEqual(_unescape('foo&amp;bar'), 'foo&bar')
297+ self.assertEqual(_unescape('foo&#160;bar'), 'foo%C2%A0bar')
298+ self.assertEqual(_unescape('&quot;foo&quot;'), '%22foo%22')
299+
300+
301+ def test_normalisation(self):
302+ """
303+ Test URL normalisation.
304+ """
305+ self.assertEqual(normaliseURL('http://foo.com//bar/baz&amp;baz'),
306+ 'http://foo.com/bar/baz&baz')
307
308=== modified file 'dosage/util.py'
309--- dosage/util.py 2009-12-15 06:55:27 +0000
310+++ dosage/util.py 2010-01-04 20:15:24 +0000
311@@ -6,6 +6,8 @@
312 import array
313 import os.path
314 import cgi
315+import re
316+from htmlentitydefs import name2codepoint
317 from time import sleep
318 from math import log, floor
319 from re import compile, IGNORECASE
320@@ -73,8 +75,43 @@
321
322 return xformedGroups
323
324-def normalizeUrl(url):
325- '''Removes any leading empty segments to avoid breaking urllib2.'''
326+
327+def _unescape(text):
328+ """
329+ Replace HTML entities and character references.
330+ """
331+ def _fixup(m):
332+ text = m.group(0)
333+ if text[:2] == "&#":
334+ # character reference
335+ try:
336+ if text[:3] == "&#x":
337+ text = unichr(int(text[3:-1], 16))
338+ else:
339+ text = unichr(int(text[2:-1]))
340+ except ValueError:
341+ pass
342+ else:
343+ # named entity
344+ try:
345+ text = unichr(name2codepoint[text[1:-1]])
346+ except KeyError:
347+ pass
348+ if isinstance(text, unicode):
349+ text = text.encode('utf-8')
350+ text = urllib2.quote(text, safe=';/?:@&=+$,')
351+ return text
352+ return re.sub("&#?\w+;", _fixup, text)
353+
354+
355+def normaliseURL(url):
356+ """
357+ Removes any leading empty segments to avoid breaking urllib2; also replaces
358+ HTML entities and character references.
359+ """
360+ # XXX: brutal hack
361+ url = _unescape(url)
362+
363 pu = list(urlparse.urlparse(url))
364 segments = pu[2].replace(' ', '%20').split('/')
365 while segments and segments[0] == '':
366@@ -82,9 +119,10 @@
367 pu[2] = '/' + '/'.join(segments)
368 return urlparse.urlunparse(pu)
369
370+
371 def urlopen(url, referrer=None, retries=5):
372 # Work around urllib2 brokenness
373- url = normalizeUrl(url)
374+ url = normaliseURL(url)
375 req = urllib2.Request(url)
376 if referrer:
377 req.add_header('Referrer', referrer)

Subscribers

People subscribed via source and target branches

to all changes: