Merge lp:~mszamot-gmail/beautifulsoup/beautifulsoup into lp:beautifulsoup

Proposed by Marcin Szamotulski on 2013-06-01
Status: Needs review
Proposed branch: lp:~mszamot-gmail/beautifulsoup/beautifulsoup
Merge into: lp:beautifulsoup
Diff against target: 316 lines (+222/-23)
2 files modified
bs4/element.py (+78/-23)
bs4/tests/test_soup.py (+144/-0)
To merge this branch: bzr merge lp:~mszamot-gmail/beautifulsoup/beautifulsoup
Reviewer Review Type Date Requested Status
Leonard Richardson 2013-06-01 Pending
Review via email: mp+166933@code.launchpad.net

Commit message

Make Tag.descendants a proper generator. Make it possible to resume iteration at a given node, which is important when one replaces nodes.

Description of the change

Make Tag.descendants a proper generator. With this patch one can send back to the generator the node at which iteration should resume. This is important if one replaces nodes with the Tag.replace_with() method. See the docstring for an example.

To post a comment you must log in.
305. By Marcin Szamotulski on 2013-06-01

Make descendants a proper generator. This makes it possible to resume iteration
at a given node, which is important when one replaces nodes.

Leonard Richardson (leonardr) wrote :

Quick questions:

1. Can you write some tests for this? Turning your docstring example into a test would be a good start.
2. Do you want to do this for the other generators as well?

306. By Marcin Szamotulski on 2013-06-01

Add a test for Tag.descendants generator

Marcin Szamotulski (mszamot-gmail) wrote :

On 11:08 Sat 01 Jun , Leonard Richardson wrote:
> Quick questions:
>
> 1. Can you write some tests for this? Turning your docstring example into a test would be a good start.
> 2. Do you want to do this for the other generators as well?
> --
> https://code.launchpad.net/~mszamot-gmail/beautifulsoup/beautifulsoup/+merge/166933
> You are the owner of lp:~mszamot-gmail/beautifulsoup/beautifulsoup.

Yes I will include some tests and I will add this for other generators
as well. Do you agree with returning iter([]) instead of None? It
makes it a little bit easier to write an iteration loop, though iter([])
does not have a send attribute. However, I don't know how to build an
empty generator other than:

    def iterator_func():
        raise StopIteration
        yield None
    empty_gen = iterator_func() # is an empty generator with both next and send attributes

Best regards,
Marcin Szamotulski

307. By Marcin Szamotulski on 2013-06-01

Added proper generators and tests: Tag.next_elements, Tag.previous_elements,
Tag.next_siblings, Tag.previous_siblings, Tag.parents.

Unmerged revisions

307. By Marcin Szamotulski on 2013-06-01

Added proper generators and tests: Tag.next_elements, Tag.previous_elements,
Tag.next_siblings, Tag.previous_siblings, Tag.parents.

306. By Marcin Szamotulski on 2013-06-01

Add a test for Tag.descendants generator

305. By Marcin Szamotulski on 2013-06-01

Make descendants a proper generator. This makes it possible to resume iteration
at a given node, which is important when one replaces nodes.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'bs4/element.py'
2--- bs4/element.py 2013-05-31 13:44:37 +0000
3+++ bs4/element.py 2013-06-01 21:29:26 +0000
4@@ -509,35 +509,45 @@
5 def next_elements(self):
6 i = self.next_element
7 while i is not None:
8- yield i
9+ n = (yield i)
10+ if n is not None:
11+ i = n
12 i = i.next_element
13
14 @property
15 def next_siblings(self):
16 i = self.next_sibling
17 while i is not None:
18- yield i
19+ n = (yield i)
20+ if n is not None:
21+ i = n
22 i = i.next_sibling
23
24 @property
25 def previous_elements(self):
26 i = self.previous_element
27 while i is not None:
28- yield i
29+ n = (yield i)
30+ if n is not None:
31+ i = n
32 i = i.previous_element
33
34 @property
35 def previous_siblings(self):
36 i = self.previous_sibling
37 while i is not None:
38- yield i
39+ n = (yield i)
40+ if n is not None:
41+ i = n
42 i = i.previous_sibling
43
44 @property
45 def parents(self):
46 i = self.parent
47 while i is not None:
48- yield i
49+ n = (yield i)
50+ if n is not None:
51+ i = n
52 i = i.parent
53
54 # Methods for supporting CSS selectors.
55@@ -810,25 +820,36 @@
56
57 By default, yields only NavigableString and CData objects. So
58 no comments, processing instructions, etc.
59+
60+ Note: strip=True returns unicode instances rather than NavigableStrings.
61 """
62- for descendant in self.descendants:
63- if (
64- (types is None and not isinstance(descendant, NavigableString))
65- or
66- (types is not None and type(descendant) not in types)):
67- continue
68- if strip:
69- descendant = descendant.strip()
70- if len(descendant) == 0:
71+ generator = self.descendants
72+ if not hasattr(generator, 'send'):
73+ raise StopIteration
74+ descendant = generator.send(None)
75+ while True:
76+ try:
77+ if (
78+ (types is None and not isinstance(descendant, NavigableString))
79+ or
80+ (types is not None and type(descendant) not in types)):
81+ descendant = generator.send(None)
82 continue
83- yield descendant
84+ if strip:
85+ descendant = descendant.strip()
86+ if len(descendant) == 0:
87+ descendant = generator.send(None)
88+ continue
89+ new = (yield descendant)
90+ descendant = generator.send(new)
91+ except StopIteration:
92+ break
93
94 strings = property(_all_strings)
95
96 @property
97 def stripped_strings(self):
98- for string in self._all_strings(True):
99- yield string
100+ return self._all_strings(True)
101
102 def get_text(self, separator=u"", strip=False,
103 types=(NavigableString, CData)):
104@@ -1176,13 +1197,47 @@
105
106 @property
107 def descendants(self):
108+ """
109+ Generator over all descendants. You can also use it to substitute
110+ nodes.
111+
112+ If you replace nodes (with :meth:`self.replace_with`) then you should
113+ send the new node back to the generator so it knows how to go further.
114+
115+ .. python::
116+
117+ soup = BeautifulSoup(html)
118+
119+ generator = soup.descendants
120+ tag = generator.send(None)
121+ while True:
122+ ntag = soup.new_tag('div')
123+ if hasattr(tag, 'name'):
124+ ntag.append('here was: %s' % tag.name)
125+ last = ntag._last_descendant()
126+ try:
127+ if not isinstance(tag, bs4.NavigableString):
128+ tag.replace_with(ntag)
129+ # resume at the end of the ntag (this guards against
130+ # circular loops)
131+ tag = generator.send(last)
132+ else:
133+ tag = generator.send(None)
134+ except StopIteration:
135+ break
136+ """
137+
138 if not len(self.contents):
139- return
140- stopNode = self._last_descendant().next_element
141- current = self.contents[0]
142- while current is not stopNode:
143- yield current
144- current = current.next_element
145+ return iter([])
146+ def generator():
147+ stopNode = self._last_descendant().next_element
148+ current = self.contents[0]
149+ while current is not stopNode:
150+ new = (yield current)
151+ if new is not None:
152+ current = new
153+ current = current.next_element
154+ return generator()
155
156 # CSS selector code
157
158
159=== modified file 'bs4/tests/test_soup.py'
160--- bs4/tests/test_soup.py 2013-05-20 18:59:32 +0000
161+++ bs4/tests/test_soup.py 2013-06-01 21:29:26 +0000
162@@ -13,6 +13,7 @@
163 ContentMetaAttributeValue,
164 SoupStrainer,
165 NamespacedAttribute,
166+ NavigableString,
167 )
168 import bs4.dammit
169 from bs4.dammit import EntitySubstitution, UnicodeDammit
170@@ -381,3 +382,146 @@
171 self.assertEqual("text/html; charset=euc-jp", value)
172 self.assertEqual("text/html; charset=euc-jp", value.original_value)
173 self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
174+
175+
176+class TestSoupGenerators(SoupTest):
177+ """
178+ Todo: add tests for:
179+ Tag.next_elements
180+ Tag.next_siblings
181+ Tag.previous_elements
182+ Tag.previous_siblings
183+ Tag.parents
184+ """
185+
186+ def setUp(self):
187+
188+ self.markup = ("level0<div>level1<span>level2<b>level3</b>"
189+ "level2'</span><span>level1'</span></div>level0'")
190+ self.MARKUP = ("<it>LEVEL0</it><div><it>LEVEL1</it><span><it>LEVEL2</it><b>"
191+ "<it>LEVEL3</it></b><it>LEVEL2'</it></span><span>"
192+ "<it>LEVEL1'</it></span></div><it>LEVEL0'</it>")
193+ self.MARKUP_next_elements = ("level0<div><it>LEVEL1</it><span>"
194+ "<it>LEVEL2</it><b><it>LEVEL3</it></b>"
195+ "<it>LEVEL2'</it></span><span>"
196+ "<it>LEVEL1'</it></span></div><it>LEVEL0'</it>")
197+ self.MARKUP_previous_elements = ("<it>LEVEL0</it><div><it>LEVEL1</it>"
198+ "<span><it>LEVEL2</it><b><it>LEVEL3</it>"
199+ "</b><it>LEVEL2'</it></span><span>"
200+ "<it>LEVEL1'</it></span></div>level0'")
201+
202+ self.markup_siblings = ("<div><span>a</span><span>b<span>c</span>"
203+ "<span>d</span></span><span>e</span>"
204+ "<span>f</span></div>")
205+ self.MARKUP_next_siblings = ("<div><span>a</span><it>B</it><it>E</it>"
206+ "<it>F</it></div>")
207+ self.MARKUP_previous_siblings = ("<div><it>A</it><it>B</it><it>E</it>"
208+ "<span>f</span></div>")
209+
210+ self.markup_parents = ("<div>level0<div>level1<div>level2"
211+ "<div class=\"inner\">level3</div>level2</div>"
212+ "level1</div>level0</div>")
213+ self.MARKUP_parents = ("<SPAN>level0<SPAN>level1<SPAN>level2"
214+ "<div class=\"inner\">level3</div>level2</SPAN>"
215+ "level1</SPAN>level0</SPAN>")
216+ def _run(self, generator, soup):
217+
218+ tag = generator.send(None)
219+ while True:
220+ try:
221+ if isinstance(tag, NavigableString):
222+ new_tag = soup.new_tag('it')
223+ new_string = soup.new_string(unicode(tag).upper())
224+ new_tag.append(new_string)
225+ tag.replace_with(new_tag)
226+ last = new_tag._last_descendant()
227+ tag = generator.send(last)
228+ else:
229+ tag = generator.send(None)
230+ except StopIteration:
231+ break
232+
233+ def _run2(self, generator, soup):
234+
235+ tag = generator.send(None)
236+ while True:
237+ try:
238+ if hasattr(tag, 'name') and tag.name == 'span':
239+ new_tag = soup.new_tag('it')
240+ new_string = soup.new_string(
241+ unicode(tag.contents[0]).upper()
242+ )
243+ new_tag.append(new_string)
244+ tag.replace_with(new_tag)
245+ tag = generator.send(new_tag)
246+ else:
247+ tag = generator.send(None)
248+ except StopIteration:
249+ break
250+
251+ def test_descendants_generator(self):
252+
253+ soup = self.soup(self.markup)
254+ generator = soup.descendants
255+ self._run(generator, soup)
256+ self.assertEquals(self.MARKUP, unicode(soup))
257+
258+ def test_strings_generator(self):
259+
260+ soup = self.soup(self.markup)
261+ generator = soup.strings
262+ self._run(generator, soup)
263+ self.assertEquals(self.MARKUP, unicode(soup))
264+
265+ def test_next_elements(self):
266+
267+ soup = self.soup(self.markup)
268+ el = soup.contents[0].next_element
269+ generator = el.next_elements
270+ self._run(generator, soup)
271+ self.assertEquals(self.MARKUP_next_elements, unicode(soup))
272+
273+ def test_previous_elements(self):
274+
275+ soup = self.soup(self.markup)
276+ el = soup._last_descendant()
277+ generator = el.previous_elements
278+ self._run(generator, soup)
279+ self.assertEquals(self.MARKUP_previous_elements, unicode(soup))
280+
281+ def test_next_siblings(self):
282+
283+ soup = self.soup(self.markup_siblings)
284+ el = soup.contents[0].next_element
285+ generator = el.next_siblings
286+ self._run2(generator, soup)
287+ self.assertEquals(self.MARKUP_next_siblings, unicode(soup))
288+
289+ def test_next_siblings(self):
290+
291+ soup = self.soup(self.markup_siblings)
292+ el = soup._last_descendant().previous_element
293+ generator = el.previous_siblings
294+ self._run2(generator, soup)
295+ self.assertEquals(self.MARKUP_previous_siblings, unicode(soup))
296+
297+ def test_parents(self):
298+
299+ soup = self.soup(self.markup_parents)
300+ div = soup.select('div.inner')[0]
301+ generator = div.parents
302+
303+ tag = generator.send(None)
304+ while True:
305+ try:
306+ new_tag = soup.new_tag('SPAN')
307+ while len(list(tag.children)):
308+ new_tag.append(tag.contents[0])
309+ tag.replace_with(new_tag)
310+ tag = generator.send(new_tag)
311+ if tag.parent is None:
312+ # one cannot replace if the parent is None
313+ raise StopIteration
314+ except StopIteration:
315+ break
316+ self.assertEquals(self.MARKUP_parents, unicode(soup))

Subscribers

People subscribed via source and target branches

to status/vote changes: