Merge lp:~robru/friends/linkify into lp:~super-friends/friends/raring

Proposed by Robert Bruce Park
Status: Merged
Approved by: Ken VanDine
Approved revision: 178
Merged at revision: 177
Proposed branch: lp:~robru/friends/linkify
Merge into: lp:~super-friends/friends/raring
Diff against target: 176 lines (+116/-3)
3 files modified
friends/tests/test_protocols.py (+76/-1)
friends/tests/test_twitter.py (+3/-2)
friends/utils/base.py (+37/-0)
To merge this branch: bzr merge lp:~robru/friends/linkify
Reviewer Review Type Date Requested Status
Ken VanDine Approve
Review via email: mp+153965@code.launchpad.net

Description of the change

Alright Ken, the regex was a little bit more complicated than I had anticipated, but I've taken steps to ensure that it is readable, and also I've included some quite comprehensive tests against it, so long term maintenance of it shouldn't be too difficult.

It's speedy, too! You were complaining that the frontend was not very performant while trying to linkify URLs, so I did a bit of profiling with this. Seems it handles an empty string in 6.0 usec, a URL-only string in 10.8 usec, an average sentence in 13.8 usec, and max-tweet-length in 16.6 usec.

Also, on top of that, we also get the performance benefit of only having to do it once, and then caching the result (as opposed to the frontend which would have to re-do it every time it displayed any message).

To post a comment you must log in.
lp:~robru/friends/linkify updated
178. By Robert Bruce Park

Make the deduplication logic a little bit more robust.

Revision history for this message
Ken VanDine (ken-vandine) wrote :

Works great!

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'friends/tests/test_protocols.py'
2--- friends/tests/test_protocols.py 2013-03-14 19:14:03 +0000
3+++ friends/tests/test_protocols.py 2013-03-19 00:45:26 +0000
4@@ -27,7 +27,7 @@
5 from friends.protocols.flickr import Flickr
6 from friends.protocols.twitter import Twitter
7 from friends.tests.mocks import FakeAccount, LogMock, TestModel, mock
8-from friends.utils.base import Base, feature
9+from friends.utils.base import Base, feature, linkify_string
10 from friends.utils.manager import ProtocolManager
11 from friends.utils.model import COLUMN_INDICES, Model
12
13@@ -392,3 +392,78 @@
14
15 def test_features(self):
16 self.assertEqual(MyProtocol.get_features(), ['feature_1', 'feature_2'])
17+
18+ def test_linkify_string(self):
19+ # String with no URL is unchanged.
20+ self.assertEqual('Hello!', linkify_string('Hello!'))
21+ # http:// works.
22+ self.assertEqual(
23+ '<a href="http://www.example.com">http://www.example.com</a>',
24+ linkify_string('http://www.example.com'))
25+ # https:// works, too.
26+ self.assertEqual(
27+ '<a href="https://www.example.com">https://www.example.com</a>',
28+ linkify_string('https://www.example.com'))
29+ # http:// is optional if you include www.
30+ self.assertEqual(
31+ '<a href="www.example.com">www.example.com</a>',
32+ linkify_string('www.example.com'))
33+ # Haha, nobody uses ftp anymore!
34+ self.assertEqual(
35+ '<a href="ftp://example.com/">ftp://example.com/</a>',
36+ linkify_string('ftp://example.com/'))
37+ # Trailing periods are not linkified.
38+ self.assertEqual(
39+ '<a href="http://example.com">http://example.com</a>.',
40+ linkify_string('http://example.com.'))
41+ # URL can contain periods without getting cut off.
42+ self.assertEqual(
43+ '<a href="http://example.com/products/buy.html">'
44+ 'http://example.com/products/buy.html</a>.',
45+ linkify_string('http://example.com/products/buy.html.'))
46+ # Don't linkify trailing brackets.
47+ self.assertEqual(
48+ 'Example Co (<a href="http://example.com">http://example.com</a>).',
49+ linkify_string('Example Co (http://example.com).'))
50+ # Don't linkify trailing exclamation marks.
51+ self.assertEqual(
52+ 'Go to <a href="https://example.com">https://example.com</a>!',
53+ linkify_string('Go to https://example.com!'))
54+ # Don't linkify trailing commas, also ensure all links are found.
55+ self.assertEqual(
56+ '<a href="www.example.com">www.example.com</a>, <a '
57+ 'href="http://example.com/stuff">http://example.com/stuff</a>, and '
58+ '<a href="http://example.com/things">http://example.com/things</a> '
59+ 'are my favorite sites.',
60+ linkify_string('www.example.com, http://example.com/stuff, and '
61+ 'http://example.com/things are my favorite sites.'))
62+ # Don't linkify trailing question marks.
63+ self.assertEqual(
64+ 'Ever been to <a href="www.example.com">www.example.com</a>?',
65+ linkify_string('Ever been to www.example.com?'))
66+ # URLs can contain question marks ok.
67+ self.assertEqual(
68+ 'Like <a href="http://example.com?foo=bar&grill=true">'
69+ 'http://example.com?foo=bar&grill=true</a>?',
70+ linkify_string('Like http://example.com?foo=bar&grill=true?'))
71+ # Multi-line strings are also supported.
72+ self.assertEqual(
73+ 'Hey, visit us online!\n\n'
74+ '<a href="http://example.com">http://example.com</a>',
75+ linkify_string('Hey, visit us online!\n\nhttp://example.com'))
76+ # Don't accidentally duplicate linkification.
77+ self.assertEqual(
78+ '<a href="www.example.com">click here!</a>',
79+ linkify_string('<a href="www.example.com">click here!</a>'))
80+ self.assertEqual(
81+ '<a href="www.example.com">www.example.com</a>',
82+ linkify_string('<a href="www.example.com">www.example.com</a>'))
83+ self.assertEqual(
84+ '<a href="www.example.com">www.example.com</a> is our website',
85+ linkify_string(
86+ '<a href="www.example.com">www.example.com</a> is our website'))
87+ # This, apparently, is valid HTML.
88+ self.assertEqual(
89+ '<a href = "www.example.com">www.example.com</a>',
90+ linkify_string(
91+ '<a href = "www.example.com">www.example.com</a>'))
92
93=== modified file 'friends/tests/test_twitter.py'
94--- friends/tests/test_twitter.py 2013-03-14 19:14:03 +0000
95+++ friends/tests/test_twitter.py 2013-03-19 00:45:26 +0000
96@@ -141,8 +141,9 @@
97 ],
98 ['twitter', 88, '240556426106372096',
99 'messages', 'Raffi Krikorian', '8285392', 'raffi', False,
100- '2012-08-28T21:08:15Z', 'lecturing at the "analyzing big data ' +
101- 'with twitter" class at @cal with @othman http://t.co/bfj7zkDJ',
102+ '2012-08-28T21:08:15Z', 'lecturing at the "analyzing big data '
103+ 'with twitter" class at @cal with @othman '
104+ '<a href="http://t.co/bfj7zkDJ">http://t.co/bfj7zkDJ</a>',
105 GLib.get_user_cache_dir() +
106 '/friends/avatars/0219effc03a3049a622476e6e001a4014f33dc31',
107 'https://twitter.com/raffi/status/240556426106372096',
108
109=== modified file 'friends/utils/base.py'
110--- friends/utils/base.py 2013-03-14 13:29:53 +0000
111+++ friends/utils/base.py 2013-03-19 00:45:26 +0000
112@@ -23,6 +23,7 @@
113 ]
114
115
116+import re
117 import time
118 import logging
119 import threading
120@@ -51,6 +52,35 @@
121 ACCT_IDX = COLUMN_INDICES['account_id']
122 TIME_IDX = COLUMN_INDICES['timestamp']
123
124+# See friends/tests/test_protocols.py for further documentation
125+LINKIFY_REGEX = re.compile(
126+ r"""
127+ # Do not match if URL is preceded by '"' or '>'
128+ # This is used to prevent duplication of linkification.
129+ (?<![\"\>])
130+ # Record everything that we're about to match.
131+ (
132+ # URLs can start with 'http://', 'https://', 'ftp://', or 'www.'
133+ (?:(?:https?|ftp)://|www\.)
134+ # Match many non-whitespace characters, but not greedily.
135+ (?:\S+?)
136+ # Stop recording the match.
137+ )
138+ # This section will peek ahead (without matching) in order to
139+ # determine precisely where the URL actually *ends*.
140+ (?=
141+ # Do not include any trailing period, comma, exclamation mark,
142+ # question mark, or closing parentheses, if any are present.
143+ [.,!?\)]*
144+ # With "trailing" defined as immediately preceding the first
145+ # space, or end-of-string.
146+ (?:\s|$)
147+ # But abort the whole thing if the URL ends with '</a>',
148+ # again to prevent duplication of linkification.
149+ (?!</a>)
150+ )""",
151+ flags=re.VERBOSE).sub
152+
153
154 # This is a mapping from message_ids to DeeModel row index ints. It is
155 # used for quickly and easily preventing the same message from being
156@@ -101,6 +131,11 @@
157 log.debug('_seen_ids: {}'.format(len(_seen_ids)))
158
159
160+def linkify_string(string):
161+ """Finds all URLs in a string and turns them into HTML links."""
162+ return LINKIFY_REGEX(r'<a href="\1">\1</a>', string)
163+
164+
165 class _OperationThread(threading.Thread):
166 """Manage async callbacks, and log subthread exceptions."""
167 # main.py will replace this with a reference to the mainloop.quit method
168@@ -324,6 +359,8 @@
169 account_id=self._account.id
170 )
171 )
172+ # linkify the message
173+ kwargs['message'] = linkify_string(kwargs.get('message', ''))
174 args = []
175 # Now iterate through all the column names listed in the
176 # SCHEMA, and pop matching column values from the kwargs, in

Subscribers

People subscribed via source and target branches