From d134556cf4606cd70926cdfee843345487beaa79 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Wed, 25 Feb 2026 22:30:16 +0100 Subject: [PATCH 1/6] Add missing Python 3.14 to classifiers --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index fffc43a..549f17a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,6 +19,7 @@ classifiers = Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3.14 [options] packages = From 5d48ba60c8910ddeb7966fdee5144eade5a87d19 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Wed, 25 Feb 2026 22:35:58 +0100 Subject: [PATCH 2/6] Implement unicode escape decoding Unicode escapes in CSS were not properly decoded before security checks. This prevents attackers from bypassing filters using escape sequences. --- CHANGES.rst | 7 ++++++ lxml_html_clean/clean.py | 22 +++++++++++++++++- tests/test_clean.py | 48 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index a3dbfac..51d32d3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,6 +6,13 @@ lxml_html_clean changelog Unreleased ========== +Bugs fixed +---------- + +* Fixed a bug where Unicode escapes in CSS were not properly decoded + before security checks. This prevents attackers from bypassing filters + using escape sequences. 
+ 0.4.3 (2025-10-02) ================== diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index 3eeda47..5424d9f 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -578,6 +578,26 @@ def _remove_javascript_link(self, link): _comments_re = re.compile(r'/\*.*?\*/', re.S) _find_comments = _comments_re.finditer _substitute_comments = _comments_re.sub + _css_unicode_escape_re = re.compile(r'\\([0-9a-fA-F]{1,6})\s?') + + def _decode_css_unicode_escapes(self, style): + """ + Decode CSS Unicode escape sequences like \\69 or \\000069 to their + actual character values. This prevents bypassing security checks + using CSS escape sequences. + + CSS escape syntax: backslash followed by 1-6 hex digits, + optionally followed by a whitespace character. + """ + def replace_escape(match): + hex_value = match.group(1) + try: + return chr(int(hex_value, 16)) + except (ValueError, OverflowError): + # Invalid unicode codepoint, keep original + return match.group(0) + + return self._css_unicode_escape_re.sub(replace_escape, style) def _has_sneaky_javascript(self, style): """ @@ -591,7 +611,7 @@ def _has_sneaky_javascript(self, style): more sneaky attempts. 
""" style = self._substitute_comments('', style) - style = style.replace('\\', '') + style = self._decode_css_unicode_escapes(style) style = _substitute_whitespace('', style) style = style.lower() if _has_javascript_scheme(style): diff --git a/tests/test_clean.py b/tests/test_clean.py index 64ad52d..d1ebcb1 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -393,3 +393,51 @@ def test_possibly_invalid_url_without_whitelist(self): self.assertEqual(len(w), 0) self.assertNotIn("google.com", result) self.assertNotIn("example.com", result) + + def test_unicode_escape_in_style(self): + # Test that CSS Unicode escapes are properly decoded before security checks + # This prevents attackers from bypassing filters using escape sequences + # CSS escape syntax: \HHHHHH where H is a hex digit (1-6 digits) + + # Test inline style attributes (requires safe_attrs_only=False) + cleaner = Cleaner(safe_attrs_only=False) + inline_style_cases = [ + # \6a\61\76\61\73\63\72\69\70\74 = "javascript" + ('
test
', '
test
'), + # \69 = 'i', so \69mport = "import" + ('
test
', '
test
'), + # \69 with space after = 'i', space consumed as part of escape + ('
test
', '
test
'), + # \65\78\70\72\65\73\73\69\6f\6e = "expression" + ('
test
', '
test
'), + ] + + for html, expected in inline_style_cases: + with self.subTest(html=html): + cleaned = cleaner.clean_html(html) + self.assertEqual(expected, cleaned) + + # Test ', + # Unicode-escaped "javascript:" without url() + '', + # Unicode-escaped "expression" + '', + # Unicode-escaped @import with 'i' + '', + # Unicode-escaped "data:" scheme + '', + # Space after escape is consumed: \69 mport = "import" + '', + # 6-digit escape: \000069 = 'i' + '', + # 6-digit escape with space + '', + ] + + for html in style_tag_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('
', cleaned) From 5121609c1dbcb28763510ab229404e2d91ebc89c Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Wed, 25 Feb 2026 22:57:28 +0100 Subject: [PATCH 3/6] Remove <base> tags to prevent URL hijacking attacks <base> tags are now automatically removed whenever <head> is removed to prevent URL hijacking attacks. According to HTML spec, <base> must be in <head>, but browsers may interpret misplaced <base> tags, allowing attackers to redirect all relative URLs to malicious servers. --- CHANGES.rst | 5 +++++ lxml_html_clean/clean.py | 6 +++++ tests/test_clean.py | 48 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 51d32d3..356590d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -12,6 +12,11 @@ Bugs fixed * Fixed a bug where Unicode escapes in CSS were not properly decoded before security checks. This prevents attackers from bypassing filters using escape sequences. +* Fixed a security issue where ``<base>`` tags could be used for URL + hijacking attacks. The ``<base>`` tag is now automatically removed + whenever the ``<head>`` tag is removed (via ``page_structure=True`` + or manual configuration), as ``<base>`` must be inside ``<head>`` + according to HTML specifications. 0.4.3 (2025-10-02) ================== diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index 5424d9f..6f95b26 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -422,6 +422,12 @@ def __call__(self, doc): if self.annoying_tags: remove_tags.update(('blink', 'marquee')) + # Remove <base> tags whenever <head> is being removed. + # According to HTML spec, <base> must be in <head>, but browsers + # may interpret it even when misplaced, allowing URL hijacking attacks. 
+ if 'head' in kill_tags or 'head' in remove_tags: + kill_tags.add('base') + _remove = deque() _kill = deque() for el in doc.iter(): diff --git a/tests/test_clean.py b/tests/test_clean.py index d1ebcb1..93f6da1 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -394,6 +394,54 @@ def test_possibly_invalid_url_without_whitelist(self): self.assertNotIn("google.com", result) self.assertNotIn("example.com", result) + + def test_base_tag_removed_with_page_structure(self): + # Test that <base> tags are removed when page_structure=True (default) + # This prevents URL hijacking attacks where <base> redirects all relative URLs + + test_cases = [ + # <base> in proper location (inside <head>) + 'link', + # <base> outside <head> + '
link
', + # Multiple <base> tags + '
', + # <base> with target attribute + '
content
', + # at various positions + 'test', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + # Verify tag is completely removed + self.assertNotIn('base', cleaned.lower()) + self.assertNotIn('evil.com', cleaned) + self.assertNotIn('evil2.com', cleaned) + + def test_base_tag_kept_when_page_structure_false(self): + # When page_structure=False and head is not removed, should be kept + cleaner = Cleaner(page_structure=False) + html = 'test' + cleaned = cleaner.clean_html(html) + self.assertIn('', cleaned) + + def test_base_tag_removed_when_head_in_remove_tags(self): + # Even with page_structure=False, should be removed if head is manually removed + cleaner = Cleaner(page_structure=False, remove_tags=['head']) + html = 'test' + cleaned = cleaner.clean_html(html) + self.assertNotIn('base', cleaned.lower()) + self.assertNotIn('evil.com', cleaned) + + def test_base_tag_removed_when_head_in_kill_tags(self): + # Even with page_structure=False, should be removed if head is in kill_tags + cleaner = Cleaner(page_structure=False, kill_tags=['head']) + html = 'test' + cleaned = cleaner.clean_html(html) + self.assertNotIn('base', cleaned.lower()) + self.assertNotIn('evil.com', cleaned) + def test_unicode_escape_in_style(self): # Test that CSS Unicode escapes are properly decoded before security checks # This prevents attackers from bypassing filters using escape sequences From c9b82ba5e61a135af4fc7e5f7ca2a526487bd198 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Thu, 26 Feb 2026 09:52:51 +0100 Subject: [PATCH 4/6] Prepare release 0.4.4 --- CHANGES.rst | 3 +++ setup.cfg | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 356590d..5deb91e 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,6 +6,9 @@ lxml_html_clean changelog Unreleased ========== +0.4.4 (2026-02-26) +================== + Bugs fixed ---------- diff --git a/setup.cfg b/setup.cfg index 549f17a..b8281ec 100644 --- a/setup.cfg +++ 
b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = lxml_html_clean -version = 0.4.3 +version = 0.4.4 description = HTML cleaner from lxml project long_description = file:README.md long_description_content_type = text/markdown From 67e029fc22168b2acbbd6ab26abef7ab1e6044fc Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Thu, 26 Feb 2026 13:56:00 +0100 Subject: [PATCH 5/6] Restore the removal of all backslashes from styles after decoding of unicode escapes --- lxml_html_clean/clean.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index 6f95b26..71f2c75 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -618,6 +618,7 @@ def _has_sneaky_javascript(self, style): """ style = self._substitute_comments('', style) style = self._decode_css_unicode_escapes(style) + style = style.replace('\\', '') style = _substitute_whitespace('', style) style = style.lower() if _has_javascript_scheme(style): From 8620e3cd1ce15218c89d76a73ae1534a7b0ca94d Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Thu, 26 Feb 2026 16:50:59 +0100 Subject: [PATCH 6/6] Add more tests for different combinations of backslashes and unicode --- tests/test_clean.py | 96 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/tests/test_clean.py b/tests/test_clean.py index 93f6da1..547ede8 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -489,3 +489,99 @@ def test_unicode_escape_in_style(self): with self.subTest(html=html): cleaned = clean_html(html) self.assertEqual('
', cleaned) + + def test_unicode_escape_mixed_with_comments(self): + # Unicode escapes mixed with CSS comments should still be caught + test_cases = [ + # \69 = 'i' with comment before + '', + # \69 = 'i' with comment after + '', + # Multiple escapes with comments + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('
', cleaned) + + def test_unicode_escape_case_insensitive(self): + # CSS hex escapes should work with both uppercase and lowercase hex digits + # \69 = 'i', \6D = 'm', etc. + test_cases = [ + # @import with uppercase hex digits: \69\6D\70\6F\72\74 + '', + # @import with some uppercase + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('
', cleaned) + + def test_unicode_escape_various_schemes(self): + # Test Unicode escapes for various malicious schemes + test_cases = [ + # \76\62\73\63\72\69\70\74 = "vbscript" + '', + # \6a\73\63\72\69\70\74 = "jscript" + '', + # \6c\69\76\65\73\63\72\69\70\74 = "livescript" + '', + # \6d\6f\63\68\61 = "mocha" + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('
', cleaned) + + def test_unicode_escape_with_whitespace_variations(self): + # Test different whitespace characters after Unicode escapes + cleaner = Cleaner(safe_attrs_only=False) + test_cases = [ + # Tab after escape + ('
test
', '
test
'), + # Newline after escape (note: actual newline, not \n) + ('
test
', '
test
'), + # Form feed after escape + ('
test
', '
test
'), + ] + + for html, expected in test_cases: + with self.subTest(html=html): + cleaned = cleaner.clean_html(html) + self.assertEqual(expected, cleaned) + + def test_backslash_removal_after_unicode_decode(self): + # After decoding Unicode escapes, remaining backslashes are removed + # This ensures double-obfuscation (unicode + backslashes) is caught + test_cases = [ + # Step 1: \69 → 'i', Step 2: remove \, Result: @import + '', + # Multiple unicode escapes with backslashes mixed in + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('
', cleaned) + + def test_backslash_obfuscation_without_unicode(self): + # Test that patterns using ONLY backslash obfuscation (no unicode) are caught + # Step 1: No unicode escapes, Step 2: remove \, Result: malicious pattern + test_cases = [ + # @\i\m\p\o\r\t → @import (caught by '@import' check) + '', + # Can also test combinations that create javascript schemes + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('
', cleaned)