From 46046c0f7125911ff8205f09a7574573bb953105 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Mon, 23 Nov 2015 15:17:07 +0000 Subject: [PATCH 1/3] Make lxml tree-builder coerce comments to work with lxml 3.5. --- html5lib/ihatexml.py | 2 ++ html5lib/treebuilders/etree_lxml.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py index 0fc7930..b5b2e98 100644 --- a/html5lib/ihatexml.py +++ b/html5lib/ihatexml.py @@ -225,6 +225,8 @@ def coerceComment(self, data): while "--" in data: warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning) data = data.replace("--", "- -") + if data.endswith("-"): + data += " " return data def coerceCharacters(self, data): diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py index 35d08ef..17007e3 100644 --- a/html5lib/treebuilders/etree_lxml.py +++ b/html5lib/treebuilders/etree_lxml.py @@ -189,7 +189,7 @@ class TreeBuilder(_base.TreeBuilder): def __init__(self, namespaceHTMLElements, fullTree=False): builder = etree_builders.getETreeModule(etree, fullTree=fullTree) - infosetFilter = self.infosetFilter = ihatexml.InfosetFilter() + infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True) self.namespaceHTMLElements = namespaceHTMLElements class Attributes(dict): From 1c22e1ce93dd4acc81a66cfa03cf9720fbd741c7 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Mon, 23 Nov 2015 15:35:21 +0000 Subject: [PATCH 2/3] fixup! Make lxml tree-builder coerce comments to work with lxml 3.5. --- html5lib/ihatexml.py | 1 + html5lib/treebuilders/etree_lxml.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py index b5b2e98..5a81a12 100644 --- a/html5lib/ihatexml.py +++ b/html5lib/ihatexml.py @@ -226,6 +226,7 @@ def coerceComment(self, data): warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning) data = data.replace("--", "- -") if data.endswith("-"): + warnings.warn("Comments cannot contain end in a dash", DataLossWarning) data += " " return data diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py index 17007e3..c6c981f 100644 --- a/html5lib/treebuilders/etree_lxml.py +++ b/html5lib/treebuilders/etree_lxml.py @@ -54,7 +54,7 @@ def _getChildNodes(self): def testSerializer(element): rv = [] finalText = None - infosetFilter = ihatexml.InfosetFilter() + infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True) def serializeElement(element, indent=0): if not hasattr(element, "tag"): @@ -257,7 +257,7 @@ def _getData(self): data = property(_getData, _setData) self.elementClass = Element - self.commentClass = builder.Comment + self.commentClass = Comment # self.fragmentClass = builder.DocumentFragment _base.TreeBuilder.__init__(self, namespaceHTMLElements) @@ -344,7 +344,8 @@ def insertRoot(self, token): # Append the initial comments: for comment_token in self.initial_comments: - root.addprevious(etree.Comment(comment_token["data"])) + comment = self.commentClass(comment_token["data"]) + root.addprevious(comment._element) # Create the root document and add the ElementTree to it self.document = self.documentClass() From 235a6d7ac7e0a3e2b431766e051094c2d3110ba3 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Mon, 23 Nov 2015 15:42:12 +0000 Subject: [PATCH 3/3] fixup! Make lxml tree-builder coerce comments to work with lxml 3.5. --- html5lib/ihatexml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py index 5a81a12..5da5d93 100644 --- a/html5lib/ihatexml.py +++ b/html5lib/ihatexml.py @@ -226,7 +226,7 @@ def coerceComment(self, data): warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning) data = data.replace("--", "- -") if data.endswith("-"): - warnings.warn("Comments cannot contain end in a dash", DataLossWarning) + warnings.warn("Comments cannot end in a dash", DataLossWarning) data += " " return data