Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,21 @@ lxml_html_clean changelog
Unreleased
==========

0.4.4 (2026-02-26)
==================

Bugs fixed
----------

* Fixed a bug where Unicode escapes in CSS were not properly decoded
before security checks. This prevents attackers from bypassing filters
using escape sequences.
* Fixed a security issue where ``<base>`` tags could be used for URL
hijacking attacks. The ``<base>`` tag is now automatically removed
whenever the ``<head>`` tag is removed (via ``page_structure=True``
or manual configuration), as ``<base>`` must be inside ``<head>``
according to HTML specifications.

0.4.3 (2025-10-02)
==================

Expand Down
27 changes: 27 additions & 0 deletions lxml_html_clean/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,12 @@ def __call__(self, doc):
if self.annoying_tags:
remove_tags.update(('blink', 'marquee'))

# Remove <base> tags whenever <head> is being removed.
# According to HTML spec, <base> must be in <head>, but browsers
# may interpret it even when misplaced, allowing URL hijacking attacks.
if 'head' in kill_tags or 'head' in remove_tags:
kill_tags.add('base')

_remove = deque()
_kill = deque()
for el in doc.iter():
Expand Down Expand Up @@ -578,6 +584,26 @@ def _remove_javascript_link(self, link):
# Matches one CSS comment (/* ... */); non-greedy, and re.S lets it span newlines.
_comments_re = re.compile(r'/\*.*?\*/', re.S)
# Shortcuts bound to the compiled pattern: iterate over / substitute comment matches.
_find_comments = _comments_re.finditer
_substitute_comments = _comments_re.sub
# Matches one CSS hex escape: a backslash, 1-6 hex digits (captured as group 1),
# and at most one trailing whitespace character that belongs to the escape.
_css_unicode_escape_re = re.compile(r'\\([0-9a-fA-F]{1,6})\s?')

def _decode_css_unicode_escapes(self, style):
    """
    Decode CSS Unicode escape sequences like \\69 or \\000069 to their
    actual character values. This prevents bypassing security checks
    using CSS escape sequences.

    CSS escape syntax: backslash followed by 1-6 hex digits,
    optionally followed by a whitespace character (which is consumed
    as part of the escape).

    As in the CSS Syntax specification, an escape whose value is zero,
    a surrogate (U+D800-U+DFFF) or larger than U+10FFFF decodes to
    U+FFFD REPLACEMENT CHARACTER, so the result never contains NUL
    characters or lone surrogates.

    :param style: CSS text (comments are expected to be stripped already).
    :return: ``style`` with every hex escape replaced by its character.
    """
    def replace_escape(match):
        # The pattern only captures 1-6 hex digits, so int() cannot fail.
        codepoint = int(match.group(1), 16)
        if (codepoint == 0
                or 0xD800 <= codepoint <= 0xDFFF
                or codepoint > 0x10FFFF):
            # Browsers substitute U+FFFD for these; mirror that so the
            # checks see the same text the browser would.
            return '\ufffd'
        return chr(codepoint)

    return self._css_unicode_escape_re.sub(replace_escape, style)

def _has_sneaky_javascript(self, style):
"""
Expand All @@ -591,6 +617,7 @@ def _has_sneaky_javascript(self, style):
more sneaky attempts.
"""
style = self._substitute_comments('', style)
style = self._decode_css_unicode_escapes(style)
style = style.replace('\\', '')
style = _substitute_whitespace('', style)
style = style.lower()
Expand Down
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = lxml_html_clean
version = 0.4.3
version = 0.4.4
description = HTML cleaner from lxml project
long_description = file:README.md
long_description_content_type = text/markdown
Expand All @@ -19,6 +19,7 @@ classifiers =
Programming Language :: Python :: 3.11
Programming Language :: Python :: 3.12
Programming Language :: Python :: 3.13
Programming Language :: Python :: 3.14

[options]
packages =
Expand Down
192 changes: 192 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,3 +393,195 @@ def test_possibly_invalid_url_without_whitelist(self):
self.assertEqual(len(w), 0)
self.assertNotIn("google.com", result)
self.assertNotIn("example.com", result)

def test_base_tag_removed_with_page_structure(self):
    # clean_html() runs with page_structure=True by default, so every <base>
    # element must disappear; a surviving <base> would let an attacker
    # re-root all relative URLs of the document (URL hijacking).
    hijack_samples = (
        # <base> where it belongs, inside <head>
        '<html><head><base href="http://evil.com/"></head><body><a href="page.html">link</a></body></html>',
        # <base> placed outside of <head>
        '<div><base href="http://evil.com/"><a href="page.html">link</a></div>',
        # more than one <base> element
        '<base href="http://evil.com/"><div><base href="http://evil2.com/"></div>',
        # <base> carrying only a target attribute
        '<base target="_blank"><div>content</div>',
        # <base> directly under <html>
        '<html><base href="http://evil.com/"><body>test</body></html>',
    )

    for markup in hijack_samples:
        with self.subTest(html=markup):
            output = clean_html(markup)
            # Neither the tag name nor any of its targets may survive.
            self.assertNotIn('base', output.lower())
            for hostile_host in ('evil.com', 'evil2.com'):
                self.assertNotIn(hostile_host, output)

def test_base_tag_kept_when_page_structure_false(self):
    # With page_structure=False <head> stays in place, so the <base>
    # element inside it has to be preserved as well.
    document = '<html><head><base href="http://example.com/"></head><body>test</body></html>'
    output = Cleaner(page_structure=False).clean_html(document)
    self.assertIn('<base href="http://example.com/">', output)

def test_base_tag_removed_when_head_in_remove_tags(self):
    # Listing 'head' in remove_tags must also get rid of <base>,
    # even though page_structure is switched off.
    document = '<html><head><base href="http://evil.com/"></head><body>test</body></html>'
    head_stripper = Cleaner(page_structure=False, remove_tags=['head'])
    output = head_stripper.clean_html(document)
    self.assertNotIn('base', output.lower())
    self.assertNotIn('evil.com', output)

def test_base_tag_removed_when_head_in_kill_tags(self):
    # Listing 'head' in kill_tags must also get rid of <base>,
    # even though page_structure is switched off.
    document = '<html><head><base href="http://evil.com/"></head><body>test</body></html>'
    head_killer = Cleaner(page_structure=False, kill_tags=['head'])
    output = head_killer.clean_html(document)
    self.assertNotIn('base', output.lower())
    self.assertNotIn('evil.com', output)

def test_unicode_escape_in_style(self):
    # CSS hex escapes (backslash + 1-6 hex digits) have to be decoded
    # before the security checks run; otherwise an attacker can hide
    # dangerous keywords behind escape sequences.

    # Part 1: inline style attributes. safe_attrs_only=False is needed
    # so that the style attribute is inspected by the cleaner.
    style_stripped = '<div>test</div>'
    inline_inputs = [
        # \6a\61\76\61\73\63\72\69\70\74 spells "javascript"
        '<div style="background: url(\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1))">test</div>',
        # \69 is 'i', hence \69mport reads "import"
        '<div style="@\\69mport url(evil.css)">test</div>',
        # the space following \69 belongs to the escape and is swallowed
        '<div style="@\\69 mport url(evil.css)">test</div>',
        # \65\78\70\72\65\73\73\69\6f\6e spells "expression"
        '<div style="\\65\\78\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))">test</div>',
    ]
    attr_cleaner = Cleaner(safe_attrs_only=False)

    for markup in inline_inputs:
        with self.subTest(html=markup):
            output = attr_cleaner.clean_html(markup)
            self.assertEqual(style_stripped, output)

    # Part 2: content of <style> elements, cleaned with the default
    # clean_html() settings.
    style_deleted = '<div><style>/* deleted */</style></div>'
    style_element_inputs = [
        # escaped "javascript:" inside url()
        '<style>url(\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # escaped "javascript:" outside of url()
        '<style>\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1)</style>',
        # escaped "expression"
        '<style>\\65\\78\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))</style>',
        # @import with an escaped 'i'
        '<style>@\\69mport url(evil.css)</style>',
        # escaped "data:" scheme
        '<style>url(\\64\\61\\74\\61:image/svg+xml;base64,PHN2ZyBvbmxvYWQ9YWxlcnQoMSk+)</style>',
        # trailing space is part of the escape: \69 mport reads "import"
        '<style>@\\69 mport url(evil.css)</style>',
        # six digit form: \000069 is 'i'
        '<style>@\\000069mport url(evil.css)</style>',
        # six digit form followed by a space
        '<style>@\\000069 mport url(evil.css)</style>',
    ]

    for markup in style_element_inputs:
        with self.subTest(html=markup):
            output = clean_html(markup)
            self.assertEqual(style_deleted, output)

def test_unicode_escape_mixed_with_comments(self):
    # Interleaving CSS comments with hex escapes must not hide the
    # dangerous keyword from the cleaner.
    style_deleted = '<div><style>/* deleted */</style></div>'
    obfuscated_sheets = (
        # comment placed in front of the escaped 'i' (\69)
        '<style>@/*comment*/\\69mport url(evil.css)</style>',
        # comment placed behind the escaped keyword
        '<style>@\\69mport/*comment*/ url(evil.css)</style>',
        # comment splitting a fully escaped "expression"
        '<style>\\65\\78/*comment*/\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))</style>',
    )

    for markup in obfuscated_sheets:
        with self.subTest(html=markup):
            output = clean_html(markup)
            self.assertEqual(style_deleted, output)

def test_unicode_escape_case_insensitive(self):
    # Hex digits of a CSS escape may be written in upper or lower case
    # (\6D and \6d are both 'm'); either spelling has to be decoded.
    style_deleted = '<div><style>/* deleted */</style></div>'
    mixed_case_sheets = (
        # "import" spelled \69\6D\70\6F\72\74 (upper-case hex letters)
        '<style>@\\69\\6D\\70\\6F\\72\\74 url(evil.css)</style>',
        # same keyword with upper- and lower-case hex letters mixed
        '<style>@\\69\\6D\\70\\6f\\72\\74 url(evil.css)</style>',
    )

    for markup in mixed_case_sheets:
        with self.subTest(html=markup):
            output = clean_html(markup)
            self.assertEqual(style_deleted, output)

def test_unicode_escape_various_schemes(self):
    # Hex-escaped spellings of several script URL schemes must all
    # cause the style sheet to be dropped.
    style_deleted = '<div><style>/* deleted */</style></div>'
    escaped_scheme_sheets = (
        # \76\62\73\63\72\69\70\74 spells "vbscript"
        '<style>url(\\76\\62\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # \6a\73\63\72\69\70\74 spells "jscript"
        '<style>url(\\6a\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # \6c\69\76\65\73\63\72\69\70\74 spells "livescript"
        '<style>url(\\6c\\69\\76\\65\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # \6d\6f\63\68\61 spells "mocha"
        '<style>url(\\6d\\6f\\63\\68\\61:alert(1))</style>',
    )

    for markup in escaped_scheme_sheets:
        with self.subTest(html=markup):
            output = clean_html(markup)
            self.assertEqual(style_deleted, output)

def test_unicode_escape_with_whitespace_variations(self):
    # A hex escape may be terminated by different whitespace characters.
    # Note the quoting: "\\69" is a literal backslash followed by "69",
    # whereas "\t", "\n" and "\f" below are Python escapes that put a real
    # tab / line feed / form feed character into the HTML string.
    style_stripped = '<div>test</div>'
    whitespace_inputs = (
        # escape terminated by a tab character
        '<div style="@\\69\tmport url(evil.css)">test</div>',
        # escape terminated by a real line feed character
        # (not the two characters backslash + "n")
        '<div style="@\\69\nmport url(evil.css)">test</div>',
        # escape terminated by a form feed character
        '<div style="@\\69\fmport url(evil.css)">test</div>',
    )
    attr_cleaner = Cleaner(safe_attrs_only=False)

    for markup in whitespace_inputs:
        with self.subTest(html=markup):
            output = attr_cleaner.clean_html(markup)
            self.assertEqual(style_stripped, output)

def test_backslash_removal_after_unicode_decode(self):
    # Backslashes left over once the hex escapes are decoded get removed
    # too, so combining both obfuscation techniques is still detected.
    style_deleted = '<div><style>/* deleted */</style></div>'
    doubly_obfuscated_sheets = (
        # \69 decodes to 'i'; dropping the other backslashes gives @import
        '<style>@\\69\\m\\p\\o\\r\\t url(evil.css)</style>',
        # several hex escapes interleaved with plain backslash escapes
        '<style>@\\69\\6d\\p\\6f\\r\\t url(evil.css)</style>',
    )

    for markup in doubly_obfuscated_sheets:
        with self.subTest(html=markup):
            output = clean_html(markup)
            self.assertEqual(style_deleted, output)

def test_backslash_obfuscation_without_unicode(self):
    # Obfuscation that relies on plain backslashes only (no hex escapes)
    # must be detected as well: nothing is decoded, the backslashes are
    # removed and the dangerous keyword becomes visible.
    style_deleted = '<div><style>/* deleted */</style></div>'
    backslash_only_sheets = (
        # a backslash before every letter: @\i\m\p\o\r\t reads @import
        '<style>@\\i\\m\\p\\o\\r\\t url(evil.css)</style>',
        # a single backslash in front of the whole keyword
        '<style>@\\import url(evil.css)</style>',
    )

    for markup in backslash_only_sheets:
        with self.subTest(html=markup):
            output = clean_html(markup)
            self.assertEqual(style_deleted, output)