Merge ~facelessuser/beautifulsoup:feature/enhanced-soupsieve into beautifulsoup:master

Proposed by Isaac Muse
Status: Needs review
Proposed branch: ~facelessuser/beautifulsoup:feature/enhanced-soupsieve
Merge into: beautifulsoup:master
Diff against target: 243 lines (+160/-6)
3 files modified
bs4/__init__.py (+15/-0)
bs4/element.py (+68/-1)
bs4/tests/test_pageelement.py (+77/-5)
| Reviewer | Review Type | Date Requested | Status |
| --- | --- | --- | --- |
| Leonard Richardson | | | Pending |
Review via email: mp+436496@code.launchpad.net

Commit message

Enhanced support for soupsieve

Add `sieve` method to allow access to all CSS query methods.
Add `css_escape` as a global method.

Description of the change

Add access to soupsieve API for queries via `sieve`. An alternative could be to use the name `css` if desired.

Add `css_escape` as a global method to escape CSS identifiers.

To post a comment you must log in.

Unmerged commits

36cd05d... by Isaac Muse

Enhanced support for soupsieve

Add `sieve` method to allow access to all CSS query methods.
Add `css_escape` as a global method.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/bs4/__init__.py b/bs4/__init__.py
2index db71cc7..226e7dd 100644
3--- a/bs4/__init__.py
4+++ b/bs4/__init__.py
5@@ -56,8 +56,23 @@ from .element import (
6 SoupStrainer,
7 Tag,
8 TemplateString,
9+ soupsieve
10 )
11
12+
13+def css_escape(identifier):
14+ """Escape a CSS identifier.
15+
16+ :param identifier: A string to be treated as a CSS identifier.
17+ Any unsupported characters will be escaped.
18+
19+ :return: An escaped CSS identifier string.
20+ :rtype: str
21+ """
22+
23+ return soupsieve.escape(identifier)
24+
25+
26 # Define some custom warnings.
27 class GuessedAtParserWarning(UserWarning):
28 """The warning issued when BeautifulSoup has to guess what parser to
29diff --git a/bs4/element.py b/bs4/element.py
30index 583d0e8..7ae054e 100644
31--- a/bs4/element.py
32+++ b/bs4/element.py
33@@ -8,6 +8,7 @@ except ImportError as e:
34 import re
35 import sys
36 import warnings
37+import functools
38 try:
39 import soupsieve
40 except ImportError as e:
41@@ -69,7 +70,52 @@ PYTHON_SPECIFIC_ENCODINGS = set([
42 "string-escape",
43 "string_escape",
44 ])
45-
46+
47+
48+class Sieve:
49+ """An object scoped to a given document/element that provides CSS selecting/matching.
50+
51+ A wrapper around the SoupSieve object which scopes it to the given tag and namespaces.
52+ """
53+
54+ def __init__(self, tag, sieve):
55+ """Initialize the object with the scoped element.
56+
57+ :param tag: The element tag that queries will be scoped to.
58+ :param sieve: The compiled SoupSieve object.
59+ """
60+
61+ self._tag = tag
62+ self._pattern = sieve
63+
64+ @property
65+ def pattern(self):
66+ """Return the pattern (SoupSieve object)."""
67+
68+ return self._pattern
69+
70+ def __getattr__(self, __name):
71+ """Get attribute of the SoupSieve object and properly return it.
72+
73+ :param __name: The name of the attribute to be retrieved. If the
74+ attribute is present on the SoupSieve object, it will be returned
75+ from there; otherwise, it will be retrieved from the Sieve object.
76+
77+ :return: The desired attribute: function, property, etc.
78+ """
79+
80+ try:
81+ # Attempt to retrieve a Sieve attribute.
82+ super().__getattribute__(__name)
83+ except AttributeError:
84+ pass
85+
86+ # We can assume the attribute must be in the SoupSieve object as it isn't present in Sieve.
87+ attr = getattr(self.pattern, __name)
88+ if callable(attr):
89+ return lambda *args, __tag=self._tag, __attr=attr, **kwargs: attr(__tag, *args, **kwargs)
90+ return attr
91+
92
93 class NamespacedAttribute(str):
94 """A namespaced string (e.g. 'xml:lang') that remembers the namespace
95@@ -1995,6 +2041,27 @@ class Tag(PageElement):
96 # ResultSet.__getattr__ has a helpful error message.
97 return ResultSet(None, results)
98
99+ def sieve(self, pattern, namespaces=None, flags=0, *, custom=None, **kwargs):
100+ """Access related Soup Sieve methods.
101+
102+ :param pattern: A string or compiled SoupSieve object that describes the
103+ desired element to select or match.
104+ :param namespaces: An optional mapping containing namespaces. If not provided,
105+ BeautifulSoup will default them to what it discovered.
106+ :param flags: Integer input specifying any and all flags to enable.
107+ :param custom: Dictionary describing custom selectors.
108+ :param **kwargs: Additional keyword arguments to be passed into SoupSieve's methods.
109+
110+ :return: A Sieve object which acts as a scoped SoupSieve object.
111+ :rtype: bs4.element.Sieve
112+ """
113+
114+ # Set the namespace if we have an uncompiled object.
115+ if namespaces is None and not isinstance(pattern, soupsieve.css_match.SoupSieve):
116+ namespaces = self._namespaces
117+
118+ return Sieve(self, soupsieve.compile(pattern, namespaces, flags=flags, custom=custom, **kwargs))
119+
120 # Old names for backwards compatibility
121 def childGenerator(self):
122 """Deprecated generator."""
123diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py
124index 75bab04..ca9abd1 100644
125--- a/bs4/tests/test_pageelement.py
126+++ b/bs4/tests/test_pageelement.py
127@@ -4,11 +4,15 @@ import pickle
128 import pytest
129
130 from soupsieve import SelectorSyntaxError
131+from soupsieve.css_match import SoupSieve
132
133-from bs4 import BeautifulSoup
134+from bs4 import (
135+ BeautifulSoup,
136+ css_escape
137+)
138 from bs4.element import (
139 Comment,
140- SoupStrainer,
141+ SoupStrainer
142 )
143 from . import SoupTest
144
145@@ -306,7 +310,6 @@ class TestCSSSelectors(SoupTest):
146 match = self.soup.select_one('nonexistenttag')
147 assert None == match
148
149-
150 def test_tag_in_tag_one(self):
151 els = self.soup.select('div div')
152 self.assert_selects('div div', ['inner', 'data1'])
153@@ -315,7 +318,6 @@ class TestCSSSelectors(SoupTest):
154 for selector in ('html div', 'html body div', 'body div'):
155 self.assert_selects(selector, ['data1', 'main', 'inner', 'footer'])
156
157-
158 def test_limit(self):
159 self.assert_selects('html div', ['main'], limit=1)
160 self.assert_selects('html body div', ['inner', 'main'], limit=2)
161@@ -634,6 +636,77 @@ class TestCSSSelectors(SoupTest):
162 for element in soup.find_all(class_=['c1', 'c2']):
163 assert element in selected
164
165+ def test_sieve(self):
166+ """Test sieve property."""
167+
168+ assert isinstance(self.soup.sieve('div').pattern, SoupSieve)
169+
170+ def test_sieve_select(self):
171+ """Test `select`."""
172+
173+ els = self.soup.sieve('div#inner p').select()
174+ assert len(els) == 3
175+
176+ def test_sieve_select_args(self):
177+ """Test `select` with arguments."""
178+
179+ els = self.soup.sieve('div#inner p').select(1)
180+ assert len(els) == 1
181+
182+ def test_sieve_select_one(self):
183+ """Test `select_one`."""
184+
185+ el = self.soup.sieve('div#inner p').select_one()
186+ assert el.name == 'p'
187+
188+ def test_sieve_iselect(self):
189+ """Test `iselect`."""
190+
191+ els = self.soup.sieve('div#inner p').iselect()
192+ # Should be a generator
193+ assert not isinstance(els, list)
194+ els = list(els)
195+ assert len(els) == 3
196+
197+ def test_sieve_match(self):
198+ """Test `match`."""
199+
200+ els = self.soup.sieve('div#inner p').select(1)
201+ assert els[0].sieve('div#inner p').match()
202+
203+ def test_sieve_closest(self):
204+ """Test `closest`."""
205+
206+ els = self.soup.sieve('div#inner p').select(1)
207+ el = els[0].sieve('div').closest()
208+ assert el.sieve('div#inner').match()
209+
210+ def test_sieve_filter(self):
211+ """Test `filter`."""
212+
213+ # Implicit wildcard match of en, en-gb, and en-us of the direct parent div#main
214+ els = self.soup.sieve('div#main > :lang("en")').select()
215+ parent = els[0].parent
216+ # Filter the direct descendants that match the given pattern.
217+ els2 = parent.sieve(':lang("en")').filter()
218+ assert len(els) == len(els2) == 3
219+
220+ def test_reuse_precompiled_pattern(self):
221+ """Test that we can reuse a precompiled pattern."""
222+
223+ pattern = self.soup.sieve('div#main > :lang("en")').pattern
224+ els = self.soup.sieve(pattern).select()
225+ assert len(els) == 3
226+ assert all([e.sieve(pattern).match() for e in els])
227+
228+ def test_css_escape(self):
229+ """Test CSS escape."""
230+
231+ # Identifiers in CSS cannot start with numbers without escaping
232+ html = """<div class="1test">test</div>"""
233+ soup = BeautifulSoup(html, 'html.parser')
234+ assert soup.sieve('.' + css_escape('1test')).select_one()['class'][0] == '1test'
235+
236
237 class TestPersistence(SoupTest):
238 "Testing features like pickle and deepcopy."
239@@ -749,4 +822,3 @@ class TestPersistence(SoupTest):
240 assert None == div_copy.previous_element
241 assert None == div_copy.find(string='Bar').next_element
242 assert None != div.find(string='Bar').next_element
243-

Subscribers

People subscribed via source and target branches

to all changes: