From d134556cf4606cd70926cdfee843345487beaa79 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Wed, 25 Feb 2026 22:30:16 +0100 Subject: [PATCH 1/6] Add missing Python 3.14 to classifiers --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index fffc43a..549f17a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,6 +19,7 @@ classifiers = Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3.14 [options] packages = From 5d48ba60c8910ddeb7966fdee5144eade5a87d19 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Wed, 25 Feb 2026 22:35:58 +0100 Subject: [PATCH 2/6] Implement unicode escape decoding Unicode escapes in CSS were not properly decoded before security checks. This prevents attackers from bypassing filters using escape sequences. --- CHANGES.rst | 7 ++++++ lxml_html_clean/clean.py | 22 +++++++++++++++++- tests/test_clean.py | 48 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index a3dbfac..51d32d3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,6 +6,13 @@ lxml_html_clean changelog Unreleased ========== +Bugs fixed +---------- + +* Fixed a bug where Unicode escapes in CSS were not properly decoded + before security checks. This prevents attackers from bypassing filters + using escape sequences. 
+ 0.4.3 (2025-10-02) ================== diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index 3eeda47..5424d9f 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -578,6 +578,26 @@ def _remove_javascript_link(self, link): _comments_re = re.compile(r'/\*.*?\*/', re.S) _find_comments = _comments_re.finditer _substitute_comments = _comments_re.sub + _css_unicode_escape_re = re.compile(r'\\([0-9a-fA-F]{1,6})\s?') + + def _decode_css_unicode_escapes(self, style): + """ + Decode CSS Unicode escape sequences like \\69 or \\000069 to their + actual character values. This prevents bypassing security checks + using CSS escape sequences. + + CSS escape syntax: backslash followed by 1-6 hex digits, + optionally followed by a whitespace character. + """ + def replace_escape(match): + hex_value = match.group(1) + try: + return chr(int(hex_value, 16)) + except (ValueError, OverflowError): + # Invalid unicode codepoint, keep original + return match.group(0) + + return self._css_unicode_escape_re.sub(replace_escape, style) def _has_sneaky_javascript(self, style): """ @@ -591,7 +611,7 @@ def _has_sneaky_javascript(self, style): more sneaky attempts. 
""" style = self._substitute_comments('', style) - style = style.replace('\\', '') + style = self._decode_css_unicode_escapes(style) style = _substitute_whitespace('', style) style = style.lower() if _has_javascript_scheme(style): diff --git a/tests/test_clean.py b/tests/test_clean.py index 64ad52d..d1ebcb1 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -393,3 +393,51 @@ def test_possibly_invalid_url_without_whitelist(self): self.assertEqual(len(w), 0) self.assertNotIn("google.com", result) self.assertNotIn("example.com", result) + + def test_unicode_escape_in_style(self): + # Test that CSS Unicode escapes are properly decoded before security checks + # This prevents attackers from bypassing filters using escape sequences + # CSS escape syntax: \HHHHHH where H is a hex digit (1-6 digits) + + # Test inline style attributes (requires safe_attrs_only=False) + cleaner = Cleaner(safe_attrs_only=False) + inline_style_cases = [ + # \6a\61\76\61\73\63\72\69\70\74 = "javascript" + ('
test
', '
test
'), + # \69 = 'i', so \69mport = "import" + ('
test
', '
test
'), + # \69 with space after = 'i', space consumed as part of escape + ('
test
', '
test
'), + # \65\78\70\72\65\73\73\69\6f\6e = "expression" + ('
test
', '
test
'), + ] + + for html, expected in inline_style_cases: + with self.subTest(html=html): + cleaned = cleaner.clean_html(html) + self.assertEqual(expected, cleaned) + + # Test ', + # Unicode-escaped "javascript:" without url() + '', + # Unicode-escaped "expression" + '', + # Unicode-escaped @import with 'i' + '', + # Unicode-escaped "data:" scheme + '', + # Space after escape is consumed: \69 mport = "import" + '', + # 6-digit escape: \000069 = 'i' + '', + # 6-digit escape with space + '', + ] + + for html in style_tag_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('
', cleaned) From 5121609c1dbcb28763510ab229404e2d91ebc89c Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Wed, 25 Feb 2026 22:57:28 +0100 Subject: [PATCH 3/6] Remove <base> tags to prevent URL hijacking attacks <base> tags are now automatically removed whenever <head> is removed to prevent URL hijacking attacks. According to HTML spec, <base> must be in <head>, but browsers may interpret misplaced <base> tags, allowing attackers to redirect all relative URLs to malicious servers. --- CHANGES.rst | 5 +++++ lxml_html_clean/clean.py | 6 +++++ tests/test_clean.py | 48 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 51d32d3..356590d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -12,6 +12,11 @@ Bugs fixed * Fixed a bug where Unicode escapes in CSS were not properly decoded before security checks. This prevents attackers from bypassing filters using escape sequences. +* Fixed a security issue where ``<base>`` tags could be used for URL + hijacking attacks. The ``<base>`` tag is now automatically removed + whenever the ``<head>`` tag is removed (via ``page_structure=True`` + or manual configuration), as ``<base>`` must be inside ``<head>`` + according to HTML specifications. 0.4.3 (2025-10-02) ================== diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index 5424d9f..6f95b26 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -422,6 +422,12 @@ def __call__(self, doc): if self.annoying_tags: remove_tags.update(('blink', 'marquee')) + # Remove <base> tags whenever <head> is being removed. + # According to HTML spec, <base> must be in <head>, but browsers + # may interpret it even when misplaced, allowing URL hijacking attacks. 
+ if 'head' in kill_tags or 'head' in remove_tags: + kill_tags.add('base') + _remove = deque() _kill = deque() for el in doc.iter(): diff --git a/tests/test_clean.py b/tests/test_clean.py index d1ebcb1..93f6da1 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -394,6 +394,54 @@ def test_possibly_invalid_url_without_whitelist(self): self.assertNotIn("google.com", result) self.assertNotIn("example.com", result) + + def test_base_tag_removed_with_page_structure(self): + # Test that <base> tags are removed when page_structure=True (default) + # This prevents URL hijacking attacks where <base> redirects all relative URLs + + test_cases = [ + # <base> in proper location (inside <head>) + 'link', + # <base> outside <head> + '
link
', + # Multiple <base> tags + '
', + # <base> with target attribute + '
content
', + # at various positions + 'test', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + # Verify tag is completely removed + self.assertNotIn('base', cleaned.lower()) + self.assertNotIn('evil.com', cleaned) + self.assertNotIn('evil2.com', cleaned) + + def test_base_tag_kept_when_page_structure_false(self): + # When page_structure=False and head is not removed, should be kept + cleaner = Cleaner(page_structure=False) + html = 'test' + cleaned = cleaner.clean_html(html) + self.assertIn('', cleaned) + + def test_base_tag_removed_when_head_in_remove_tags(self): + # Even with page_structure=False, should be removed if head is manually removed + cleaner = Cleaner(page_structure=False, remove_tags=['head']) + html = 'test' + cleaned = cleaner.clean_html(html) + self.assertNotIn('base', cleaned.lower()) + self.assertNotIn('evil.com', cleaned) + + def test_base_tag_removed_when_head_in_kill_tags(self): + # Even with page_structure=False, should be removed if head is in kill_tags + cleaner = Cleaner(page_structure=False, kill_tags=['head']) + html = 'test' + cleaned = cleaner.clean_html(html) + self.assertNotIn('base', cleaned.lower()) + self.assertNotIn('evil.com', cleaned) + def test_unicode_escape_in_style(self): # Test that CSS Unicode escapes are properly decoded before security checks # This prevents attackers from bypassing filters using escape sequences From c9b82ba5e61a135af4fc7e5f7ca2a526487bd198 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Thu, 26 Feb 2026 09:52:51 +0100 Subject: [PATCH 4/6] Prepare release 0.4.4 --- CHANGES.rst | 3 +++ setup.cfg | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 356590d..5deb91e 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,6 +6,9 @@ lxml_html_clean changelog Unreleased ========== +0.4.4 (2026-02-26) +================== + Bugs fixed ---------- diff --git a/setup.cfg b/setup.cfg index 549f17a..b8281ec 100644 --- a/setup.cfg +++ 
b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = lxml_html_clean -version = 0.4.3 +version = 0.4.4 description = HTML cleaner from lxml project long_description = file:README.md long_description_content_type = text/markdown From 67e029fc22168b2acbbd6ab26abef7ab1e6044fc Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Thu, 26 Feb 2026 13:56:00 +0100 Subject: [PATCH 5/6] Restore the removal of all backslashes from styles after decoding of unicode escapes --- lxml_html_clean/clean.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index 6f95b26..71f2c75 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -618,6 +618,7 @@ def _has_sneaky_javascript(self, style): """ style = self._substitute_comments('', style) style = self._decode_css_unicode_escapes(style) + style = style.replace('\\', '') style = _substitute_whitespace('', style) style = style.lower() if _has_javascript_scheme(style): From 8620e3cd1ce15218c89d76a73ae1534a7b0ca94d Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Thu, 26 Feb 2026 16:50:59 +0100 Subject: [PATCH 6/6] Add more tests for different combinations of backslashes and unicode --- tests/test_clean.py | 96 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/tests/test_clean.py b/tests/test_clean.py index 93f6da1..547ede8 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -489,3 +489,99 @@ def test_unicode_escape_in_style(self): with self.subTest(html=html): cleaned = clean_html(html) self.assertEqual('
', cleaned) + + def test_unicode_escape_mixed_with_comments(self): + # Unicode escapes mixed with CSS comments should still be caught + test_cases = [ + # \69 = 'i' with comment before + '', + # \69 = 'i' with comment after + '', + # Multiple escapes with comments + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('
', cleaned) + + def test_unicode_escape_case_insensitive(self): + # CSS hex escapes should work with both uppercase and lowercase hex digits + # \69 = 'i', \6D = 'm', etc. + test_cases = [ + # @import with uppercase hex digits: \69\6D\70\6F\72\74 + '', + # @import with some uppercase + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('
', cleaned) + + def test_unicode_escape_various_schemes(self): + # Test Unicode escapes for various malicious schemes + test_cases = [ + # \76\62\73\63\72\69\70\74 = "vbscript" + '', + # \6a\73\63\72\69\70\74 = "jscript" + '', + # \6c\69\76\65\73\63\72\69\70\74 = "livescript" + '', + # \6d\6f\63\68\61 = "mocha" + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('
', cleaned) + + def test_unicode_escape_with_whitespace_variations(self): + # Test different whitespace characters after Unicode escapes + cleaner = Cleaner(safe_attrs_only=False) + test_cases = [ + # Tab after escape + ('
test
', '
test
'), + # Newline after escape (note: actual newline, not \n) + ('
test
', '
test
'), + # Form feed after escape + ('
test
', '
test
'), + ] + + for html, expected in test_cases: + with self.subTest(html=html): + cleaned = cleaner.clean_html(html) + self.assertEqual(expected, cleaned) + + def test_backslash_removal_after_unicode_decode(self): + # After decoding Unicode escapes, remaining backslashes are removed + # This ensures double-obfuscation (unicode + backslashes) is caught + test_cases = [ + # Step 1: \69 → 'i', Step 2: remove \, Result: @import + '', + # Multiple unicode escapes with backslashes mixed in + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('
', cleaned) + + def test_backslash_obfuscation_without_unicode(self): + # Test that patterns using ONLY backslash obfuscation (no unicode) are caught + # Step 1: No unicode escapes, Step 2: remove \, Result: malicious pattern + test_cases = [ + # @\i\m\p\o\r\t → @import (caught by '@import' check) + '', + # Can also test combinations that create javascript schemes + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('
', cleaned)