Merge lp:~stolowski/unity-lens-shopping/markup-cleaner-fix into lp:unity-lens-shopping

Proposed by Paweł Stołowski
Status: Merged
Approved by: Michal Hruby
Approved revision: 31
Merged at revision: 26
Proposed branch: lp:~stolowski/unity-lens-shopping/markup-cleaner-fix
Merge into: lp:unity-lens-shopping
Diff against target: 345 lines (+273/-26)
5 files modified
Makefile.am (+1/-1)
configure.ac (+1/-0)
src/markup-cleaner.vala (+59/-25)
tests/unit/Makefile.am (+28/-0)
tests/unit/test-markup-cleaner.vala (+184/-0)
To merge this branch: bzr merge lp:~stolowski/unity-lens-shopping/markup-cleaner-fix
Reviewer Review Type Date Requested Status
Michal Hruby (community) Approve
Review via email: mp+127275@code.launchpad.net

Commit message

Fixed regular expression for capturing html tags; fixed replace_cb logic and added unit tests for MarkupCleaner.

Description of the change

Fixed regular expression for capturing html tags; fixed replace_cb logic and added unit tests for MarkupCleaner.

To post a comment you must log in.
Revision history for this message
Michal Hruby (mhr3) wrote :

40 + internal static const string HTML_MARKUP_RE = "(</?)\\s*([^>]*?)\\s*(/?>)|(\\&(?!(([a-z]+)|(#\\d+));))";

> (?!(([a-z]+) -> perhaps (?!(\S{1,6})) instead? (the entities can be uppercase and have digits)

Pls add `<a href="wooo"><small>Click me!</small></a>` to the unit tests.

review: Needs Fixing
28. By Paweł Stołowski

Also support entities with numbers. Added test case for SMALL tag surrounded by A tag.

Revision history for this message
Michal Hruby (mhr3) wrote :

> (?!(([a-z]+) -> perhaps (?!(\S{1,6})) instead? (the entities can be uppercase and have digits)

As discussed on IRC pango/glib recognizes only 5 named entities, will throw an error otherwise (amp, gt, lt, apos, quote) .

29. By Paweł Stołowski

It turns out pango supports a limited set of entities, so filter all named entitites except for amp, lt, gt, quot, apos.

30. By Paweł Stołowski

Use named position in the regex for better readability.

31. By Paweł Stołowski

Lowercase entity names. Unknown entities are converted to raw string instead of discarding.

Revision history for this message
Michal Hruby (mhr3) wrote :

Great now! +1

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'Makefile.am'
2--- Makefile.am 2012-07-25 08:13:14 +0000
3+++ Makefile.am 2012-10-01 15:11:29 +0000
4@@ -1,4 +1,4 @@
5-SUBDIRS = src data po
6+SUBDIRS = src data po tests/unit
7
8 #
9 # Install the shopping.lens
10
11=== modified file 'configure.ac'
12--- configure.ac 2012-09-26 15:33:10 +0000
13+++ configure.ac 2012-10-01 15:11:29 +0000
14@@ -121,6 +121,7 @@
15 src/config.vala
16 vapi/Makefile
17 po/Makefile.in
18+ tests/unit/Makefile
19 ])
20 AC_OUTPUT
21
22
23=== modified file 'src/markup-cleaner.vala'
24--- src/markup-cleaner.vala 2012-09-18 09:31:25 +0000
25+++ src/markup-cleaner.vala 2012-10-01 15:11:29 +0000
26@@ -23,37 +23,71 @@
27 {
28 /**
29 * Regex for capturing HTML tags. It's closely related to the logic in replace_cb,
30- * so be careful with any changes to it.
31- * First three parts of the regex capture <..> tags; there are three variants of it, because
32- * we want to capture the actual tag name, without <> and /, so <foo/>, </foo> and <foo> are
33- * treated separately in the regex.
34- * The last part captures &, unless it's &amp; already.
35+ * so be careful with any changes to it; don't forget to run unit tests!
36+ * First part of the regex captures <..> tags.
37+ * The last part captures '&' unless it's an entity.
38 */
39- internal static const string HTML_MARKUP_RE = "<(.*?)\\s*/>|</\\s*(.*?)>|<(.*?)>|(\\&(?!amp;))";
40+
41+ internal static const string HTML_MARKUP_RE = "(?'OPENING_BRACKET'</?)\\s*(?'TAG_NAME'[^>]*?)\\s*(?'CLOSING_BRACKET'/?>)|(?'ENTITY'\\&(?'ENTITY_NAME'[a-zA-Z0-9]+);)|(?'AMPERSAND'\\&(?!(#\\d+)))";
42
43 internal static bool replace_cb (MatchInfo match, StringBuilder result)
44 {
45- string? entire_substring = match.fetch (0);
46- if (match.get_match_count () > 1 && entire_substring != null)
47- {
48- if (entire_substring == "&")
49+ string? entire_substring = match.fetch (0).down ();
50+
51+ /* html <..> tags */
52+ if (match.get_match_count () == 4 && entire_substring != null)
53+ {
54+ string? opening_bracket = match.fetch_named ("OPENING_BRACKET");
55+ string? html_tag_name = match.fetch_named ("TAG_NAME").down ();
56+ string? closing_bracket = match.fetch_named ("CLOSING_BRACKET");
57+
58+ if (html_tag_name == "br")
59+ {
60+ result.append ("\n");
61+ }
62+ else if (html_tag_name == "b" ||
63+ html_tag_name == "i" ||
64+ html_tag_name == "u" ||
65+ html_tag_name == "s" ||
66+ html_tag_name == "tt" ||
67+ html_tag_name == "big" ||
68+ html_tag_name == "small" ||
69+ html_tag_name == "sup" ||
70+ html_tag_name == "sub")
71+ {
72+ result.append (entire_substring);
73+ }
74+ else if (html_tag_name == "strike")
75+ {
76+ result.append (opening_bracket + "s" + closing_bracket);
77+ }
78+ }
79+ else if (match.get_match_count () == 6)
80+ {
81+ string entity_name = match.fetch_named ("ENTITY_NAME").down ();
82+ if (entity_name == "amp" ||
83+ entity_name == "quot" ||
84+ entity_name == "apos" ||
85+ entity_name == "lt" ||
86+ entity_name == "gt")
87+ {
88+ result.append (match.fetch_named ("ENTITY").down ());
89+ }
90+ else if (entity_name == "nbsp")
91+ {
92+ result.append (" ");
93+ }
94+ else //append raw entity name with &amp; prefix
95 {
96 result.append ("&amp;");
97- }
98- else /* html <..> tags */
99- {
100- string? html_tag_name = match.fetch (1).down ();
101- if (html_tag_name == "br")
102- {
103- result.append ("\n");
104- }
105- else if (html_tag_name == "b")
106- {
107- result.append (entire_substring);
108- }
109- }
110- /* else - ignore all other html tags */
111- }
112+ result.append (entity_name);
113+ }
114+ }
115+ else if (match.get_match_count () == 7 && match.fetch_named ("AMPERSAND") == "&")
116+ {
117+ result.append ("&amp;");
118+ }
119+ /* else - ignore all other html tags */
120 return false;
121 }
122
123
124=== added directory 'tests/unit'
125=== added file 'tests/unit/Makefile.am'
126--- tests/unit/Makefile.am 1970-01-01 00:00:00 +0000
127+++ tests/unit/Makefile.am 2012-10-01 15:11:29 +0000
128@@ -0,0 +1,28 @@
129+check_PROGRAMS = test-markup-cleaner
130+TESTS = $(check_PROGRAMS)
131+
132+test_markup_cleaner_SOURCES = \
133+ test-markup-cleaner.vala \
134+ $(top_srcdir)/src/markup-cleaner.vala
135+ $(NULL)
136+
137+test_markup_cleaner_CPPFLAGS = \
138+ -w \
139+ $(LENS_DAEMON_CFLAGS) \
140+ -I$(srcdir) \
141+ $(NULL)
142+
143+AM_VALAFLAGS = \
144+ -C \
145+ --vapidir $(top_srcdir)/vapi \
146+ --pkg glib-2.0 \
147+ --pkg gio-2.0 \
148+ --pkg gio-unix-2.0 \
149+ --target-glib=2.26 \
150+ $(NULL)
151+
152+test_markup_cleaner_LDADD = \
153+ $(LENS_DAEMON_LIBS) \
154+ $(test_libs)
155+
156+CLEANFILES = *.stamp
157
158=== added file 'tests/unit/test-markup-cleaner.vala'
159--- tests/unit/test-markup-cleaner.vala 1970-01-01 00:00:00 +0000
160+++ tests/unit/test-markup-cleaner.vala 2012-10-01 15:11:29 +0000
161@@ -0,0 +1,184 @@
162+/*
163+ * Copyright (C) 2012 Canonical Ltd
164+ *
165+ * This program is free software: you can redistribute it and/or modify
166+ * it under the terms of the GNU General Public License version 3 as
167+ * published by the Free Software Foundation.
168+ *
169+ * This program is distributed in the hope that it will be useful,
170+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
171+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
172+ * GNU General Public License for more details.
173+ *
174+ * You should have received a copy of the GNU General Public License
175+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
176+ *
177+ * Authored by Pawel Stolowski <pawel.stolowski@canonical.com>
178+ *
179+ */
180+
181+using Unity.ShoppingLens;
182+
183+public class Main
184+{
185+ public static int main (string[] args)
186+ {
187+ Test.init (ref args);
188+
189+ Test.add_data_func ("/Unit/NoMarkup", test_no_markup);
190+ Test.add_data_func ("/Unit/BrTagSupport", test_br_tag_support);
191+ Test.add_data_func ("/Unit/BTagSupport", test_b_tag_support);
192+ Test.add_data_func ("/Unit/ITagSupport", test_i_tag_support);
193+ Test.add_data_func ("/Unit/UTagSupport", test_u_tag_support);
194+ Test.add_data_func ("/Unit/TtTagSupport", test_tt_tag_support);
195+ Test.add_data_func ("/Unit/STagSupport", test_s_tag_support);
196+ Test.add_data_func ("/Unit/StrikeTagSupport", test_strike_tag_support);
197+ Test.add_data_func ("/Unit/BigTagSupport", test_big_tag_support);
198+ Test.add_data_func ("/Unit/SmallTagSupport", test_small_tag_support);
199+ Test.add_data_func ("/Unit/SubTagSupport", test_sub_tag_support);
200+ Test.add_data_func ("/Unit/SupTagSupport", test_sup_tag_support);
201+ Test.add_data_func ("/Unit/UnsupportedTags", test_unsupported_tags);
202+ Test.add_data_func ("/Unit/NestedTags", test_nested_tags);
203+ Test.add_data_func ("/Unit/AmpEntitySupport", test_amp_entity);
204+ Test.add_data_func ("/Unit/NbspEntitySupport", test_nbsp_entity);
205+ Test.add_data_func ("/Unit/EntitiesArePreserved", test_basic_entities_are_preserved);
206+ Test.add_data_func ("/Unit/UnsupportedEntitiesAreRaw", test_unsupported_entities_are_raw);
207+ Test.add_data_func ("/Unit/NumericEntitiesArePreserved", test_num_entities_are_preserved);
208+
209+ Test.run ();
210+ return 0;
211+ }
212+
213+ internal static void test_no_markup ()
214+ {
215+ string input = "This is a\ntest";
216+ string result = MarkupCleaner.html_to_pango_markup (input);
217+ assert (result == "This is a\ntest");
218+ }
219+
220+ internal static void test_br_tag_support ()
221+ {
222+ string input = "This is<br/> a<br />test<br>";
223+ string result = MarkupCleaner.html_to_pango_markup (input);
224+ assert (result == "This is\n a\ntest\n");
225+ }
226+
227+ internal static void test_b_tag_support ()
228+ {
229+ string input = "<B>T</b>his <b>is</b> a <B>test</B>";
230+ string result = MarkupCleaner.html_to_pango_markup (input);
231+ assert (result == "<b>T</b>his <b>is</b> a <b>test</b>");
232+ }
233+
234+ internal static void test_i_tag_support ()
235+ {
236+ string input = "<I>T</i>his <i>is</i> a <I>test</I>";
237+ string result = MarkupCleaner.html_to_pango_markup (input);
238+ assert (result == "<i>T</i>his <i>is</i> a <i>test</i>");
239+ }
240+
241+ internal static void test_u_tag_support ()
242+ {
243+ string input = "<U>T</u>his <u>is</u> a <U>test</U>";
244+ string result = MarkupCleaner.html_to_pango_markup (input);
245+ assert (result == "<u>T</u>his <u>is</u> a <u>test</u>");
246+ }
247+
248+ internal static void test_tt_tag_support ()
249+ {
250+ string input = "<TT>T</TT>his <tt>is</tt> a <tT>test</Tt>";
251+ string result = MarkupCleaner.html_to_pango_markup (input);
252+ assert (result == "<tt>T</tt>his <tt>is</tt> a <tt>test</tt>");
253+ }
254+
255+ internal static void test_s_tag_support ()
256+ {
257+ string input = "<S>T</s>his <s>is</s> a <S>test</S>";
258+ string result = MarkupCleaner.html_to_pango_markup (input);
259+ assert (result == "<s>T</s>his <s>is</s> a <s>test</s>");
260+ }
261+
262+ internal static void test_strike_tag_support ()
263+ {
264+ string input = "<STRIKE>T</STRIKE>his <strike>is</strike> a <STRike>test</Strike>";
265+ string result = MarkupCleaner.html_to_pango_markup (input);
266+ assert (result == "<s>T</s>his <s>is</s> a <s>test</s>");
267+ }
268+
269+ internal static void test_small_tag_support ()
270+ {
271+ string input = "<SMALL>T</SMALL>his <small>is</small> a <SMall>test</SmaLL>";
272+ string result = MarkupCleaner.html_to_pango_markup (input);
273+ assert (result == "<small>T</small>his <small>is</small> a <small>test</small>");
274+ }
275+
276+ internal static void test_big_tag_support ()
277+ {
278+ string input = "<BIG>T></BIG>his <big>is</big> a <bIG>test</BiG>";
279+ string result = MarkupCleaner.html_to_pango_markup (input);
280+ assert (result == "<big>T></big>his <big>is</big> a <big>test</big>");
281+ }
282+
283+ internal static void test_sub_tag_support ()
284+ {
285+ string input = "<SUB>T</SUB>his <sub>is</sub> a <suB>test</SuB>";
286+ string result = MarkupCleaner.html_to_pango_markup (input);
287+ assert (result == "<sub>T</sub>his <sub>is</sub> a <sub>test</sub>");
288+ }
289+
290+ internal static void test_sup_tag_support ()
291+ {
292+ string input = "<SUP>T</SUP>his <sup>is</sup> a <suP>test</SuP>";
293+ string result = MarkupCleaner.html_to_pango_markup (input);
294+ assert (result == "<sup>T</sup>his <sup>is</sup> a <sup>test</sup>");
295+ }
296+
297+ internal static void test_unsupported_tags ()
298+ {
299+ string input = "<foo>This</bar> is a <a href=\"http://foo.com\">test</a>";
300+ string result = MarkupCleaner.html_to_pango_markup (input);
301+ assert (result == "This is a test");
302+ }
303+
304+ internal static void test_nested_tags ()
305+ {
306+ string input = "<a href=\"wooo\"><small>Click me!</small></a>";
307+ string result = MarkupCleaner.html_to_pango_markup (input);
308+ assert (result == "<small>Click me!</small>");
309+ }
310+
311+ internal static void test_amp_entity ()
312+ {
313+ string input = "Foo & Bar";
314+ string result = MarkupCleaner.html_to_pango_markup (input);
315+ assert (result == "Foo &amp; Bar");
316+ }
317+
318+ internal static void test_nbsp_entity ()
319+ {
320+ string input = "Foo&nbsp;Bar";
321+ string result = MarkupCleaner.html_to_pango_markup (input);
322+ assert (result == "Foo Bar");
323+ }
324+
325+ internal static void test_basic_entities_are_preserved ()
326+ {
327+ string input = "Foo &amp; bar &lt; &GT; &Quot; &apos;";
328+ string result = MarkupCleaner.html_to_pango_markup (input);
329+ assert (result == "Foo &amp; bar &lt; &gt; &quot; &apos;");
330+ }
331+
332+ internal static void test_unsupported_entities_are_raw ()
333+ {
334+ string input = "Foo &frac14; bar &not;";
335+ string result = MarkupCleaner.html_to_pango_markup (input);
336+ assert (result == "Foo &amp;frac14 bar &amp;not");
337+ }
338+
339+ internal static void test_num_entities_are_preserved ()
340+ {
341+ string input = "Foo&#160;bar &#8364;";
342+ string result = MarkupCleaner.html_to_pango_markup (input);
343+ assert (result == "Foo&#160;bar &#8364;");
344+ }
345+}

Subscribers

People subscribed via source and target branches