Merge lp:~stolowski/unity-lens-shopping/markup-cleaner-fix into lp:unity-lens-shopping

Proposed by Paweł Stołowski
Status: Merged
Approved by: Michal Hruby
Approved revision: 31
Merged at revision: 26
Proposed branch: lp:~stolowski/unity-lens-shopping/markup-cleaner-fix
Merge into: lp:unity-lens-shopping
Diff against target: 345 lines (+273/-26)
5 files modified
Makefile.am (+1/-1)
configure.ac (+1/-0)
src/markup-cleaner.vala (+59/-25)
tests/unit/Makefile.am (+28/-0)
tests/unit/test-markup-cleaner.vala (+184/-0)
To merge this branch: bzr merge lp:~stolowski/unity-lens-shopping/markup-cleaner-fix
Reviewer Review Type Date Requested Status
Michal Hruby (community) Approve
Review via email: mp+127275@code.launchpad.net

Commit message

Fixed regular expression for capturing html tags; fixed replace_cb logic and added unit tests for MarkupCleaner.

Description of the change

Fixed regular expression for capturing html tags; fixed replace_cb logic and added unit tests for MarkupCleaner.

To post a comment you must log in.
Revision history for this message
Michal Hruby (mhr3) wrote :

40 + internal static const string HTML_MARKUP_RE = "(</?)\\s*([^>]*?)\\s*(/?>)|(\\&(?!(([a-z]+)|(#\\d+));))";

> (?!(([a-z]+) -> perhaps (?!(\S{1,6})) instead? (the entities can be uppercase and have digits)

Pls add `<a href="wooo"><small>Click me!</small></a>` to the unit tests.

review: Needs Fixing
28. By Paweł Stołowski

Also support entities with numbers. Added test case for SMALL tag surrounded by A tag.

Revision history for this message
Michal Hruby (mhr3) wrote :

> (?!(([a-z]+) -> perhaps (?!(\S{1,6})) instead? (the entities can be uppercase and have digits)

As discussed on IRC pango/glib recognizes only 5 named entities, will throw an error otherwise (amp, gt, lt, apos, quote) .

29. By Paweł Stołowski

It turns out pango supports a limited set of entities, so filter all named entitites except for amp, lt, gt, quot, apos.

30. By Paweł Stołowski

Use named position in the regex for better readability.

31. By Paweł Stołowski

Lowercase entity names. Unknown entities are converted to raw string instead of discarding.

Revision history for this message
Michal Hruby (mhr3) wrote :

Great now! +1

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'Makefile.am'
--- Makefile.am 2012-07-25 08:13:14 +0000
+++ Makefile.am 2012-10-01 15:11:29 +0000
@@ -1,4 +1,4 @@
1SUBDIRS = src data po1SUBDIRS = src data po tests/unit
22
3#3#
4# Install the shopping.lens4# Install the shopping.lens
55
=== modified file 'configure.ac'
--- configure.ac 2012-09-26 15:33:10 +0000
+++ configure.ac 2012-10-01 15:11:29 +0000
@@ -121,6 +121,7 @@
121 src/config.vala121 src/config.vala
122 vapi/Makefile122 vapi/Makefile
123 po/Makefile.in123 po/Makefile.in
124 tests/unit/Makefile
124])125])
125AC_OUTPUT126AC_OUTPUT
126127
127128
=== modified file 'src/markup-cleaner.vala'
--- src/markup-cleaner.vala 2012-09-18 09:31:25 +0000
+++ src/markup-cleaner.vala 2012-10-01 15:11:29 +0000
@@ -23,37 +23,71 @@
23 {23 {
24 /**24 /**
25 * Regex for capturing HTML tags. It's closely related to the logic in replace_cb,25 * Regex for capturing HTML tags. It's closely related to the logic in replace_cb,
26 * so be careful with any changes to it.26 * so be careful with any changes to it; don't forget to run unit tests!
27 * First three parts of the regex capture <..> tags; there are three variants of it, because27 * First part of the regex captures <..> tags.
28 * we want to capture the actual tag name, without <> and /, so <foo/>, </foo> and <foo> are28 * The last part captures '&' unless it's an entity.
29 * treated separately in the regex.
30 * The last part captures &, unless it's &amp; already.
31 */29 */
32 internal static const string HTML_MARKUP_RE = "<(.*?)\\s*/>|</\\s*(.*?)>|<(.*?)>|(\\&(?!amp;))";30
31 internal static const string HTML_MARKUP_RE = "(?'OPENING_BRACKET'</?)\\s*(?'TAG_NAME'[^>]*?)\\s*(?'CLOSING_BRACKET'/?>)|(?'ENTITY'\\&(?'ENTITY_NAME'[a-zA-Z0-9]+);)|(?'AMPERSAND'\\&(?!(#\\d+)))";
3332
34 internal static bool replace_cb (MatchInfo match, StringBuilder result)33 internal static bool replace_cb (MatchInfo match, StringBuilder result)
35 {34 {
36 string? entire_substring = match.fetch (0);35 string? entire_substring = match.fetch (0).down ();
37 if (match.get_match_count () > 1 && entire_substring != null)36
38 {37 /* html <..> tags */
39 if (entire_substring == "&")38 if (match.get_match_count () == 4 && entire_substring != null)
39 {
40 string? opening_bracket = match.fetch_named ("OPENING_BRACKET");
41 string? html_tag_name = match.fetch_named ("TAG_NAME").down ();
42 string? closing_bracket = match.fetch_named ("CLOSING_BRACKET");
43
44 if (html_tag_name == "br")
45 {
46 result.append ("\n");
47 }
48 else if (html_tag_name == "b" ||
49 html_tag_name == "i" ||
50 html_tag_name == "u" ||
51 html_tag_name == "s" ||
52 html_tag_name == "tt" ||
53 html_tag_name == "big" ||
54 html_tag_name == "small" ||
55 html_tag_name == "sup" ||
56 html_tag_name == "sub")
57 {
58 result.append (entire_substring);
59 }
60 else if (html_tag_name == "strike")
61 {
62 result.append (opening_bracket + "s" + closing_bracket);
63 }
64 }
65 else if (match.get_match_count () == 6)
66 {
67 string entity_name = match.fetch_named ("ENTITY_NAME").down ();
68 if (entity_name == "amp" ||
69 entity_name == "quot" ||
70 entity_name == "apos" ||
71 entity_name == "lt" ||
72 entity_name == "gt")
73 {
74 result.append (match.fetch_named ("ENTITY").down ());
75 }
76 else if (entity_name == "nbsp")
77 {
78 result.append (" ");
79 }
80 else //append raw entity name with &amp; prefix
40 {81 {
41 result.append ("&amp;");82 result.append ("&amp;");
42 }83 result.append (entity_name);
43 else /* html <..> tags */84 }
44 {85 }
45 string? html_tag_name = match.fetch (1).down ();86 else if (match.get_match_count () == 7 && match.fetch_named ("AMPERSAND") == "&")
46 if (html_tag_name == "br")87 {
47 {88 result.append ("&amp;");
48 result.append ("\n");89 }
49 }90 /* else - ignore all other html tags */
50 else if (html_tag_name == "b")
51 {
52 result.append (entire_substring);
53 }
54 }
55 /* else - ignore all other html tags */
56 }
57 return false;91 return false;
58 }92 }
59 93
6094
=== added directory 'tests/unit'
=== added file 'tests/unit/Makefile.am'
--- tests/unit/Makefile.am 1970-01-01 00:00:00 +0000
+++ tests/unit/Makefile.am 2012-10-01 15:11:29 +0000
@@ -0,0 +1,28 @@
1check_PROGRAMS = test-markup-cleaner
2TESTS = $(check_PROGRAMS)
3
4test_markup_cleaner_SOURCES = \
5 test-markup-cleaner.vala \
6 $(top_srcdir)/src/markup-cleaner.vala
7 $(NULL)
8
9test_markup_cleaner_CPPFLAGS = \
10 -w \
11 $(LENS_DAEMON_CFLAGS) \
12 -I$(srcdir) \
13 $(NULL)
14
15AM_VALAFLAGS = \
16 -C \
17 --vapidir $(top_srcdir)/vapi \
18 --pkg glib-2.0 \
19 --pkg gio-2.0 \
20 --pkg gio-unix-2.0 \
21 --target-glib=2.26 \
22 $(NULL)
23
24test_markup_cleaner_LDADD = \
25 $(LENS_DAEMON_LIBS) \
26 $(test_libs)
27
28CLEANFILES = *.stamp
029
=== added file 'tests/unit/test-markup-cleaner.vala'
--- tests/unit/test-markup-cleaner.vala 1970-01-01 00:00:00 +0000
+++ tests/unit/test-markup-cleaner.vala 2012-10-01 15:11:29 +0000
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) 2012 Canonical Ltd
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 3 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 *
16 * Authored by Pawel Stolowski <pawel.stolowski@canonical.com>
17 *
18 */
19
20using Unity.ShoppingLens;
21
22public class Main
23{
24 public static int main (string[] args)
25 {
26 Test.init (ref args);
27
28 Test.add_data_func ("/Unit/NoMarkup", test_no_markup);
29 Test.add_data_func ("/Unit/BrTagSupport", test_br_tag_support);
30 Test.add_data_func ("/Unit/BTagSupport", test_b_tag_support);
31 Test.add_data_func ("/Unit/ITagSupport", test_i_tag_support);
32 Test.add_data_func ("/Unit/UTagSupport", test_u_tag_support);
33 Test.add_data_func ("/Unit/TtTagSupport", test_tt_tag_support);
34 Test.add_data_func ("/Unit/STagSupport", test_s_tag_support);
35 Test.add_data_func ("/Unit/StrikeTagSupport", test_strike_tag_support);
36 Test.add_data_func ("/Unit/BigTagSupport", test_big_tag_support);
37 Test.add_data_func ("/Unit/SmallTagSupport", test_small_tag_support);
38 Test.add_data_func ("/Unit/SubTagSupport", test_sub_tag_support);
39 Test.add_data_func ("/Unit/SupTagSupport", test_sup_tag_support);
40 Test.add_data_func ("/Unit/UnsupportedTags", test_unsupported_tags);
41 Test.add_data_func ("/Unit/NestedTags", test_nested_tags);
42 Test.add_data_func ("/Unit/AmpEntitySupport", test_amp_entity);
43 Test.add_data_func ("/Unit/NbspEntitySupport", test_nbsp_entity);
44 Test.add_data_func ("/Unit/EntitiesArePreserved", test_basic_entities_are_preserved);
45 Test.add_data_func ("/Unit/UnsupportedEntitiesAreRaw", test_unsupported_entities_are_raw);
46 Test.add_data_func ("/Unit/NumericEntitiesArePreserved", test_num_entities_are_preserved);
47
48 Test.run ();
49 return 0;
50 }
51
52 internal static void test_no_markup ()
53 {
54 string input = "This is a\ntest";
55 string result = MarkupCleaner.html_to_pango_markup (input);
56 assert (result == "This is a\ntest");
57 }
58
59 internal static void test_br_tag_support ()
60 {
61 string input = "This is<br/> a<br />test<br>";
62 string result = MarkupCleaner.html_to_pango_markup (input);
63 assert (result == "This is\n a\ntest\n");
64 }
65
66 internal static void test_b_tag_support ()
67 {
68 string input = "<B>T</b>his <b>is</b> a <B>test</B>";
69 string result = MarkupCleaner.html_to_pango_markup (input);
70 assert (result == "<b>T</b>his <b>is</b> a <b>test</b>");
71 }
72
73 internal static void test_i_tag_support ()
74 {
75 string input = "<I>T</i>his <i>is</i> a <I>test</I>";
76 string result = MarkupCleaner.html_to_pango_markup (input);
77 assert (result == "<i>T</i>his <i>is</i> a <i>test</i>");
78 }
79
80 internal static void test_u_tag_support ()
81 {
82 string input = "<U>T</u>his <u>is</u> a <U>test</U>";
83 string result = MarkupCleaner.html_to_pango_markup (input);
84 assert (result == "<u>T</u>his <u>is</u> a <u>test</u>");
85 }
86
87 internal static void test_tt_tag_support ()
88 {
89 string input = "<TT>T</TT>his <tt>is</tt> a <tT>test</Tt>";
90 string result = MarkupCleaner.html_to_pango_markup (input);
91 assert (result == "<tt>T</tt>his <tt>is</tt> a <tt>test</tt>");
92 }
93
94 internal static void test_s_tag_support ()
95 {
96 string input = "<S>T</s>his <s>is</s> a <S>test</S>";
97 string result = MarkupCleaner.html_to_pango_markup (input);
98 assert (result == "<s>T</s>his <s>is</s> a <s>test</s>");
99 }
100
101 internal static void test_strike_tag_support ()
102 {
103 string input = "<STRIKE>T</STRIKE>his <strike>is</strike> a <STRike>test</Strike>";
104 string result = MarkupCleaner.html_to_pango_markup (input);
105 assert (result == "<s>T</s>his <s>is</s> a <s>test</s>");
106 }
107
108 internal static void test_small_tag_support ()
109 {
110 string input = "<SMALL>T</SMALL>his <small>is</small> a <SMall>test</SmaLL>";
111 string result = MarkupCleaner.html_to_pango_markup (input);
112 assert (result == "<small>T</small>his <small>is</small> a <small>test</small>");
113 }
114
115 internal static void test_big_tag_support ()
116 {
117 string input = "<BIG>T></BIG>his <big>is</big> a <bIG>test</BiG>";
118 string result = MarkupCleaner.html_to_pango_markup (input);
119 assert (result == "<big>T></big>his <big>is</big> a <big>test</big>");
120 }
121
122 internal static void test_sub_tag_support ()
123 {
124 string input = "<SUB>T</SUB>his <sub>is</sub> a <suB>test</SuB>";
125 string result = MarkupCleaner.html_to_pango_markup (input);
126 assert (result == "<sub>T</sub>his <sub>is</sub> a <sub>test</sub>");
127 }
128
129 internal static void test_sup_tag_support ()
130 {
131 string input = "<SUP>T</SUP>his <sup>is</sup> a <suP>test</SuP>";
132 string result = MarkupCleaner.html_to_pango_markup (input);
133 assert (result == "<sup>T</sup>his <sup>is</sup> a <sup>test</sup>");
134 }
135
136 internal static void test_unsupported_tags ()
137 {
138 string input = "<foo>This</bar> is a <a href=\"http://foo.com\">test</a>";
139 string result = MarkupCleaner.html_to_pango_markup (input);
140 assert (result == "This is a test");
141 }
142
143 internal static void test_nested_tags ()
144 {
145 string input = "<a href=\"wooo\"><small>Click me!</small></a>";
146 string result = MarkupCleaner.html_to_pango_markup (input);
147 assert (result == "<small>Click me!</small>");
148 }
149
150 internal static void test_amp_entity ()
151 {
152 string input = "Foo & Bar";
153 string result = MarkupCleaner.html_to_pango_markup (input);
154 assert (result == "Foo &amp; Bar");
155 }
156
157 internal static void test_nbsp_entity ()
158 {
159 string input = "Foo&nbsp;Bar";
160 string result = MarkupCleaner.html_to_pango_markup (input);
161 assert (result == "Foo Bar");
162 }
163
164 internal static void test_basic_entities_are_preserved ()
165 {
166 string input = "Foo &amp; bar &lt; &GT; &Quot; &apos;";
167 string result = MarkupCleaner.html_to_pango_markup (input);
168 assert (result == "Foo &amp; bar &lt; &gt; &quot; &apos;");
169 }
170
171 internal static void test_unsupported_entities_are_raw ()
172 {
173 string input = "Foo &frac14; bar &not;";
174 string result = MarkupCleaner.html_to_pango_markup (input);
175 assert (result == "Foo &amp;frac14 bar &amp;not");
176 }
177
178 internal static void test_num_entities_are_preserved ()
179 {
180 string input = "Foo&#160;bar &#8364;";
181 string result = MarkupCleaner.html_to_pango_markup (input);
182 assert (result == "Foo&#160;bar &#8364;");
183 }
184}

Subscribers

People subscribed via source and target branches