Merge lp:~robru/friends/linkify into lp:~super-friends/friends/raring

Proposed by Robert Bruce Park
Status: Merged
Approved by: Ken VanDine
Approved revision: 178
Merged at revision: 177
Proposed branch: lp:~robru/friends/linkify
Merge into: lp:~super-friends/friends/raring
Diff against target: 176 lines (+116/-3)
3 files modified
friends/tests/test_protocols.py (+76/-1)
friends/tests/test_twitter.py (+3/-2)
friends/utils/base.py (+37/-0)
To merge this branch: bzr merge lp:~robru/friends/linkify
Reviewer Review Type Date Requested Status
Ken VanDine Approve
Review via email: mp+153965@code.launchpad.net

Description of the change

Alright Ken, the regex was a little bit more complicated than I had anticipated, but I've taken steps to ensure that it is readable, and also I've included some quite comprehensive tests against it, so long term maintenance of it shouldn't be too difficult.

It's speedy, too! You were complaining that the frontend was not very performant while trying to linkify URLs, so I did a bit of profiling with this. Seems it handles an empty string in 6.0 usec, a URL-only string in 10.8 usec, an average sentence in 13.8 usec, and max-tweet-length in 16.6 usec.

Also, on top of that, we also get the performance benefit of only having to do it once, and then caching the result (as opposed to the frontend which would have to re-do it every time it displayed any message).

To post a comment you must log in.
lp:~robru/friends/linkify updated
178. By Robert Bruce Park

Make the deduplication logic a little bit more robust.

Revision history for this message
Ken VanDine (ken-vandine) wrote :

Works great!

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'friends/tests/test_protocols.py'
--- friends/tests/test_protocols.py 2013-03-14 19:14:03 +0000
+++ friends/tests/test_protocols.py 2013-03-19 00:45:26 +0000
@@ -27,7 +27,7 @@
27from friends.protocols.flickr import Flickr27from friends.protocols.flickr import Flickr
28from friends.protocols.twitter import Twitter28from friends.protocols.twitter import Twitter
29from friends.tests.mocks import FakeAccount, LogMock, TestModel, mock29from friends.tests.mocks import FakeAccount, LogMock, TestModel, mock
30from friends.utils.base import Base, feature30from friends.utils.base import Base, feature, linkify_string
31from friends.utils.manager import ProtocolManager31from friends.utils.manager import ProtocolManager
32from friends.utils.model import COLUMN_INDICES, Model32from friends.utils.model import COLUMN_INDICES, Model
3333
@@ -392,3 +392,78 @@
392392
393 def test_features(self):393 def test_features(self):
394 self.assertEqual(MyProtocol.get_features(), ['feature_1', 'feature_2'])394 self.assertEqual(MyProtocol.get_features(), ['feature_1', 'feature_2'])
395
396 def test_linkify_string(self):
397 # String with no URL is unchanged.
398 self.assertEqual('Hello!', linkify_string('Hello!'))
399 # http:// works.
400 self.assertEqual(
401 '<a href="http://www.example.com">http://www.example.com</a>',
402 linkify_string('http://www.example.com'))
403 # https:// works, too.
404 self.assertEqual(
405 '<a href="https://www.example.com">https://www.example.com</a>',
406 linkify_string('https://www.example.com'))
407 # http:// is optional if you include www.
408 self.assertEqual(
409 '<a href="www.example.com">www.example.com</a>',
410 linkify_string('www.example.com'))
411 # Haha, nobody uses ftp anymore!
412 self.assertEqual(
413 '<a href="ftp://example.com/">ftp://example.com/</a>',
414 linkify_string('ftp://example.com/'))
415 # Trailing periods are not linkified.
416 self.assertEqual(
417 '<a href="http://example.com">http://example.com</a>.',
418 linkify_string('http://example.com.'))
419 # URL can contain periods without getting cut off.
420 self.assertEqual(
421 '<a href="http://example.com/products/buy.html">'
422 'http://example.com/products/buy.html</a>.',
423 linkify_string('http://example.com/products/buy.html.'))
424 # Don't linkify trailing brackets.
425 self.assertEqual(
426 'Example Co (<a href="http://example.com">http://example.com</a>).',
427 linkify_string('Example Co (http://example.com).'))
428 # Don't linkify trailing exclamation marks.
429 self.assertEqual(
430 'Go to <a href="https://example.com">https://example.com</a>!',
431 linkify_string('Go to https://example.com!'))
432 # Don't linkify trailing commas, also ensure all links are found.
433 self.assertEqual(
434 '<a href="www.example.com">www.example.com</a>, <a '
435 'href="http://example.com/stuff">http://example.com/stuff</a>, and '
436 '<a href="http://example.com/things">http://example.com/things</a> '
437 'are my favorite sites.',
438 linkify_string('www.example.com, http://example.com/stuff, and '
439 'http://example.com/things are my favorite sites.'))
440 # Don't linkify trailing question marks.
441 self.assertEqual(
442 'Ever been to <a href="www.example.com">www.example.com</a>?',
443 linkify_string('Ever been to www.example.com?'))
444 # URLs can contain question marks ok.
445 self.assertEqual(
446 'Like <a href="http://example.com?foo=bar&grill=true">'
447 'http://example.com?foo=bar&grill=true</a>?',
448 linkify_string('Like http://example.com?foo=bar&grill=true?'))
449 # Multi-line strings are also supported.
450 self.assertEqual(
451 'Hey, visit us online!\n\n'
452 '<a href="http://example.com">http://example.com</a>',
453 linkify_string('Hey, visit us online!\n\nhttp://example.com'))
454 # Don't accidentally duplicate linkification.
455 self.assertEqual(
456 '<a href="www.example.com">click here!</a>',
457 linkify_string('<a href="www.example.com">click here!</a>'))
458 self.assertEqual(
459 '<a href="www.example.com">www.example.com</a>',
460 linkify_string('<a href="www.example.com">www.example.com</a>'))
461 self.assertEqual(
462 '<a href="www.example.com">www.example.com</a> is our website',
463 linkify_string(
464 '<a href="www.example.com">www.example.com</a> is our website'))
465 # This, apparently, is valid HTML.
466 self.assertEqual(
467 '<a href = "www.example.com">www.example.com</a>',
468 linkify_string(
469 '<a href = "www.example.com">www.example.com</a>'))
395470
=== modified file 'friends/tests/test_twitter.py'
--- friends/tests/test_twitter.py 2013-03-14 19:14:03 +0000
+++ friends/tests/test_twitter.py 2013-03-19 00:45:26 +0000
@@ -141,8 +141,9 @@
141 ],141 ],
142 ['twitter', 88, '240556426106372096',142 ['twitter', 88, '240556426106372096',
143 'messages', 'Raffi Krikorian', '8285392', 'raffi', False,143 'messages', 'Raffi Krikorian', '8285392', 'raffi', False,
144 '2012-08-28T21:08:15Z', 'lecturing at the "analyzing big data ' +144 '2012-08-28T21:08:15Z', 'lecturing at the "analyzing big data '
145 'with twitter" class at @cal with @othman http://t.co/bfj7zkDJ',145 'with twitter" class at @cal with @othman '
146 '<a href="http://t.co/bfj7zkDJ">http://t.co/bfj7zkDJ</a>',
146 GLib.get_user_cache_dir() +147 GLib.get_user_cache_dir() +
147 '/friends/avatars/0219effc03a3049a622476e6e001a4014f33dc31',148 '/friends/avatars/0219effc03a3049a622476e6e001a4014f33dc31',
148 'https://twitter.com/raffi/status/240556426106372096',149 'https://twitter.com/raffi/status/240556426106372096',
149150
=== modified file 'friends/utils/base.py'
--- friends/utils/base.py 2013-03-14 13:29:53 +0000
+++ friends/utils/base.py 2013-03-19 00:45:26 +0000
@@ -23,6 +23,7 @@
23 ]23 ]
2424
2525
26import re
26import time27import time
27import logging28import logging
28import threading29import threading
@@ -51,6 +52,35 @@
51ACCT_IDX = COLUMN_INDICES['account_id']52ACCT_IDX = COLUMN_INDICES['account_id']
52TIME_IDX = COLUMN_INDICES['timestamp']53TIME_IDX = COLUMN_INDICES['timestamp']
5354
55# See friends/tests/test_protocols.py for further documentation
56LINKIFY_REGEX = re.compile(
57 r"""
58 # Do not match if URL is preceded by '"' or '>'
59 # This is used to prevent duplication of linkification.
60 (?<![\"\>])
61 # Record everything that we're about to match.
62 (
63 # URLs can start with 'http://', 'https://', 'ftp://', or 'www.'
64 (?:(?:https?|ftp)://|www\.)
65 # Match many non-whitespace characters, but not greedily.
66 (?:\S+?)
67 # Stop recording the match.
68 )
69 # This section will peek ahead (without matching) in order to
70 # determine precisely where the URL actually *ends*.
71 (?=
72 # Do not include any trailing period, comma, exclamation mark,
73 # question mark, or closing parentheses, if any are present.
74 [.,!?\)]*
75 # With "trailing" defined as immediately preceding the first
76 # space, or end-of-string.
77 (?:\s|$)
78 # But abort the whole thing if the URL ends with '</a>',
79 # again to prevent duplication of linkification.
80 (?!</a>)
81 )""",
82 flags=re.VERBOSE).sub
83
5484
55# This is a mapping from message_ids to DeeModel row index ints. It is85# This is a mapping from message_ids to DeeModel row index ints. It is
56# used for quickly and easily preventing the same message from being86# used for quickly and easily preventing the same message from being
@@ -101,6 +131,11 @@
101 log.debug('_seen_ids: {}'.format(len(_seen_ids)))131 log.debug('_seen_ids: {}'.format(len(_seen_ids)))
102132
103133
134def linkify_string(string):
135 """Finds all URLs in a string and turns them into HTML links."""
136 return LINKIFY_REGEX(r'<a href="\1">\1</a>', string)
137
138
104class _OperationThread(threading.Thread):139class _OperationThread(threading.Thread):
105 """Manage async callbacks, and log subthread exceptions."""140 """Manage async callbacks, and log subthread exceptions."""
106 # main.py will replace this with a reference to the mainloop.quit method141 # main.py will replace this with a reference to the mainloop.quit method
@@ -324,6 +359,8 @@
324 account_id=self._account.id359 account_id=self._account.id
325 )360 )
326 )361 )
362 # linkify the message
363 kwargs['message'] = linkify_string(kwargs.get('message', ''))
327 args = []364 args = []
328 # Now iterate through all the column names listed in the365 # Now iterate through all the column names listed in the
329 # SCHEMA, and pop matching column values from the kwargs, in366 # SCHEMA, and pop matching column values from the kwargs, in

Subscribers

People subscribed via source and target branches