From: DongHun Kwak Date: Fri, 15 Jul 2022 00:02:43 +0000 (+0900) Subject: Imported Upstream version 3.3.3 X-Git-Tag: upstream/3.3.3^0 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a8f8fe7d98d7fc8ae6402b63578da104be21febf;p=platform%2Fupstream%2Fpython3-markdown.git Imported Upstream version 3.3.3 --- diff --git a/.github/workflows/process.yml b/.github/workflows/process.yml new file mode 100644 index 0000000..6e3ac51 --- /dev/null +++ b/.github/workflows/process.yml @@ -0,0 +1,24 @@ +name: bot + +on: + pull_request: + branches: + - '**' + +jobs: + require_changelog: + + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: mskelton/changelog-reminder-action@v1 + with: + # Match any file in the docs/change_log/ dir. + changelogRegex: "docs/change_log/.*" + # Only require changelog update if changes were made in markdown/ + include: "markdown/.*" + message: | + @${{ github.actor }}, thank you for your contribution. It appears that you have not added a comment to the + change log describing the changes you have made. Doing so will help to ensure your contribution is accepted. + + Please see the [Contributing Guide](https://python-markdown.github.io/contributing/#pull-requests) for details. diff --git a/docs/change_log/index.md b/docs/change_log/index.md index 47e8f9e..632449a 100644 --- a/docs/change_log/index.md +++ b/docs/change_log/index.md @@ -3,6 +3,13 @@ title: Change Log Python-Markdown Change Log ========================= +Oct 25, 2020: version 3.3.3 (a bug-fix release). + +* Unify all block-level tags (#1047). +* Fix issue where some empty elements would have text rendered as `None` when using `md_in_html` (#1049). +* Avoid catastrophic backtracking in `hr` regex (#1055). +* Fix `hr` HTML handling (#1053). + Oct 19, 2020: version 3.3.2 (a bug-fix release). * Properly parse inline HTML in md_in_html (#1040 & #1045). 
diff --git a/docs/extensions/md_in_html.md b/docs/extensions/md_in_html.md index ba4424b..978f5c3 100644 --- a/docs/extensions/md_in_html.md +++ b/docs/extensions/md_in_html.md @@ -25,10 +25,10 @@ The `markdown` attribute can be assigned one of three values: [`"1"`](#1), [`"bl When the `markdown` attribute is set to `"1"`, then the parser will use the default behavior for that specific tag. -The following tags have the `block` behavior by default: `address`, `article`, `aside`, `blockquote`, `body`, -`colgroup`, `details`, `div`, `dl`, `fieldset`, `figcaption`, `figure`, `footer`, `form`, `iframe`, `header`, `hr`, -`main`, `menu`, `nav`, `map`, `noscript`, `object`, `ol`, `section`, `table`, `tbody`, `thead`, `tfoot`, `tr`, and -`ul`. +The following tags have the `block` behavior by default: `article`, `aside`, `blockquote`, `body`, `colgroup`, +`details`, `div`, `dl`, `fieldset`, `figcaption`, `figure`, `footer`, `form`, `group`, `header`, `hgroup`, `hr`, +`iframe`, `main`, `map`, `menu`, `nav`, `noscript`, `object`, `ol`, `output`, `progress`, `section`, `table`, +`tbody`, `tfoot`, `thead`, `tr`, `ul` and `video`. For example, the following: diff --git a/markdown/__meta__.py b/markdown/__meta__.py index a951fb8..c5b5a33 100644 --- a/markdown/__meta__.py +++ b/markdown/__meta__.py @@ -26,7 +26,7 @@ License: BSD (see LICENSE.md for details). # (1, 2, 0, 'beta', 2) => "1.2b2" # (1, 2, 0, 'rc', 4) => "1.2rc4" # (1, 2, 0, 'final', 0) => "1.2" -__version_info__ = (3, 3, 2, 'final', 0) +__version_info__ = (3, 3, 3, 'final', 0) def _get_version(version_info): diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py index 742f174..7d31a7f 100644 --- a/markdown/blockprocessors.py +++ b/markdown/blockprocessors.py @@ -496,16 +496,15 @@ class SetextHeaderProcessor(BlockProcessor): class HRProcessor(BlockProcessor): """ Process Horizontal Rules. 
""" - RE = r'^[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*$' + # Python's re module doesn't officially support atomic grouping. However you can fake it. + # See https://stackoverflow.com/a/13577411/866026 + RE = r'^[ ]{0,3}(?=(?P<atomicgroup>(-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,}))(?P=atomicgroup)[ ]*$' # Detect hr on any line of a block. SEARCH_RE = re.compile(RE, re.MULTILINE) def test(self, parent, block): m = self.SEARCH_RE.search(block) - # No atomic grouping in python so we simulate it here for performance. - # The regex only matches what would be in the atomic group - the HR. - # Then check if we are at end of block or if next char is a newline. - if m and (m.end() == len(block) or block[m.end()] == '\n'): + if m: # Save match object on class instance so we can use it later. self.match = m return True diff --git a/markdown/core.py b/markdown/core.py index 79ca3f3..2f7f2d5 100644 --- a/markdown/core.py +++ b/markdown/core.py @@ -77,11 +77,12 @@ class Markdown: # See https://w3c.github.io/html/grouping-content.html#the-p-element 'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', - 'h4', 'h5', 'h6', 'header', 'hr', 'main', 'menu', 'nav', 'ol', 'p', 'pre', - 'section', 'table', 'ul', + 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol', + 'p', 'pre', 'section', 'table', 'ul', # Other elements which Markdown should not be mucking up the contents of. 
- 'canvas', 'dd', 'dt', 'group', 'iframe', 'li', 'math', 'noscript', 'output', - 'progress', 'script', 'style', 'tbody', 'td', 'th', 'thead', 'tr', 'video' + 'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend', + 'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script', + 'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video' ] self.registeredExtensions = [] diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index f635563..eb8902e 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -23,27 +23,26 @@ from ..htmlparser import HTMLExtractor import xml.etree.ElementTree as etree -# Block-level tags in which the content only gets span level parsing -span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th'] - -# Block-level tags in which the content gets parsed as blocks -block_tags = [ - 'address', 'article', 'aside', 'blockquote', 'body', 'colgroup', 'details', 'div', 'dl', 'fieldset', - 'figcaption', 'figure', 'footer', 'form', 'iframe', 'header', 'hr', 'main', 'menu', 'nav', 'map', - 'noscript', 'object', 'ol', 'section', 'table', 'tbody', 'thead', 'tfoot', 'tr', 'ul' -] - -# Block-level tags which never get their content parsed. -raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'] - -block_level_tags = span_tags + block_tags + raw_tags - - class HTMLExtractorExtra(HTMLExtractor): """ Override HTMLExtractor and create etree Elements for any elements which should have content parsed as Markdown. """ + def __init__(self, md, *args, **kwargs): + # All block-level tags. 
+ self.block_level_tags = set(md.block_level_elements.copy()) + # Block-level tags in which the content only gets span level parsing + self.span_tags = set( + ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th'] + ) + # Block-level tags which never get their content parsed. + self.raw_tags = set(['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea']) + # Block-level tags in which the content gets parsed as blocks + super().__init__(md, *args, **kwargs) + + self.block_tags = set(self.block_level_tags) - (self.span_tags | self.raw_tags | self.empty_tags) + self.span_and_blocks_tags = self.block_tags | self.span_tags + def reset(self): """Reset this instance. Loses all unprocessed data.""" self.mdstack = [] # When markdown=1, stack contains a list of tags @@ -75,13 +74,13 @@ class HTMLExtractorExtra(HTMLExtractor): if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'): # Only use the parent state if it is more restrictive than the markdown attribute. 
md_attr = parent_state - if ((md_attr == '1' and tag in block_tags) or - (md_attr == 'block' and tag in span_tags + block_tags)): + if ((md_attr == '1' and tag in self.block_tags) or + (md_attr == 'block' and tag in self.span_and_blocks_tags)): return 'block' - elif ((md_attr == '1' and tag in span_tags) or - (md_attr == 'span' and tag in span_tags + block_tags)): + elif ((md_attr == '1' and tag in self.span_tags) or + (md_attr == 'span' and tag in self.span_and_blocks_tags)): return 'span' - elif tag in block_level_tags: + elif tag in self.block_level_tags: return 'off' else: # pragma: no cover return None @@ -95,7 +94,19 @@ class HTMLExtractorExtra(HTMLExtractor): return value def handle_starttag(self, tag, attrs): - if tag in block_level_tags: + # Handle tags that should always be empty and do not specify a closing tag + if tag in self.empty_tags: + attrs = {key: value if value is not None else key for key, value in attrs} + if "markdown" in attrs: + attrs.pop('markdown') + element = etree.Element(tag, attrs) + data = etree.tostring(element, encoding='unicode', method='html') + else: + data = self.get_starttag_text() + self.handle_empty_tag(data, True) + return + + if tag in self.block_level_tags: # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`. # Convert to `{'checked': 'checked'}`. 
attrs = {key: value if value is not None else key for key, value in attrs} @@ -106,7 +117,7 @@ class HTMLExtractorExtra(HTMLExtractor): attrs.pop('markdown', None) super().handle_starttag(tag, attrs) else: - if 'p' in self.mdstack and tag in block_level_tags: + if 'p' in self.mdstack and tag in self.block_level_tags: # Close unclosed 'p' tag self.handle_endtag('p') self.mdstate.append(state) @@ -125,7 +136,7 @@ class HTMLExtractorExtra(HTMLExtractor): self.handle_data(text) def handle_endtag(self, tag): - if tag in block_level_tags: + if tag in self.block_level_tags: if self.inraw: super().handle_endtag(tag) elif tag in self.mdstack: @@ -166,6 +177,19 @@ class HTMLExtractorExtra(HTMLExtractor): else: self.handle_data(text) + def handle_startendtag(self, tag, attrs): + if tag in self.empty_tags: + attrs = {key: value if value is not None else key for key, value in attrs} + if "markdown" in attrs: + attrs.pop('markdown') + element = etree.Element(tag, attrs) + data = etree.tostring(element, encoding='unicode', method='html') + else: + data = self.get_starttag_text() + else: + data = self.get_starttag_text() + self.handle_empty_tag(data, is_block=self.md.is_block_level(tag)) + def handle_data(self, data): if self.inraw or not self.mdstack: super().handle_data(data) @@ -265,6 +289,8 @@ class MarkdownInHtmlProcessor(BlockProcessor): else: # Disable inline parsing for everything else + if element.text is None: + element.text = '' element.text = util.AtomicString(element.text) for child in list(element): self.parse_element_content(child) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 6776d34..fee9cd5 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -56,6 +56,10 @@ class HTMLExtractor(htmlparser.HTMLParser): def __init__(self, md, *args, **kwargs): if 'convert_charrefs' not in kwargs: kwargs['convert_charrefs'] = False + + # Block tags that should contain no content (self closing) + self.empty_tags = set(['hr']) + # This calls 
self.reset super().__init__(*args, **kwargs) self.md = md @@ -120,6 +124,11 @@ class HTMLExtractor(htmlparser.HTMLParser): return '</{}>'.format(tag) def handle_starttag(self, tag, attrs): + # Handle tags that should always be empty and do not specify a closing tag + if tag in self.empty_tags: + self.handle_startendtag(tag, attrs) + return + if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)): # Started a new raw block. Prepare stack. self.inraw = True @@ -183,6 +192,10 @@ class HTMLExtractor(htmlparser.HTMLParser): else: # More content exists after tag. self.intail = True + item = self.cleandoc[-1] if self.cleandoc else '' + # If we only have one newline before block element, add another + if not item.endswith('\n\n') and item.endswith('\n'): + self.cleandoc.append('\n') self.cleandoc.append(self.md.htmlStash.store(data)) # Insert blank line between this and next line. self.cleandoc.append('\n\n') diff --git a/markdown/util.py b/markdown/util.py index a49486b..2cb2317 100644 --- a/markdown/util.py +++ b/markdown/util.py @@ -58,11 +58,12 @@ BLOCK_LEVEL_ELEMENTS = [ # See https://w3c.github.io/html/grouping-content.html#the-p-element 'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', - 'h4', 'h5', 'h6', 'header', 'hr', 'main', 'menu', 'nav', 'ol', 'p', 'pre', - 'section', 'table', 'ul', + 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol', + 'p', 'pre', 'section', 'table', 'ul', # Other elements which Markdown should not be mucking up the contents of. 
- 'canvas', 'dd', 'dt', 'group', 'iframe', 'li', 'math', 'noscript', 'output', - 'progress', 'script', 'style', 'tbody', 'td', 'th', 'thead', 'tr', 'video' + 'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend', + 'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script', + 'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video' ] # Placeholders diff --git a/tests/test_syntax/blocks/test_hr.py b/tests/test_syntax/blocks/test_hr.py index 009a39d..85a51b3 100644 --- a/tests/test_syntax/blocks/test_hr.py +++ b/tests/test_syntax/blocks/test_hr.py @@ -377,3 +377,26 @@ class TestHorizontalRules(TestCase): '

_ _

' ) + + def test_2_consecutive_hr(self): + self.assertMarkdownRenders( + self.dedent( + """ + - - - + - - - + """ + ), + self.dedent( + """ +
+
+ """ + ) + ) + + def test_not_hr_end_in_char(self): + self.assertMarkdownRenders( + '--------------------------------------c', + + '

--------------------------------------c

' + ) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 3fea766..589f682 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -1402,3 +1402,102 @@ class TestHTMLBlocks(TestCase): """ ) ) + + def test_hr_only_start(self): + self.assertMarkdownRenders( + self.dedent( + """ + *emphasis1* +
+ *emphasis2* + """ + ), + self.dedent( + """ +

emphasis1

+
+

emphasis2

+ """ + ) + ) + + def test_hr_self_close(self): + self.assertMarkdownRenders( + self.dedent( + """ + *emphasis1* +
+ *emphasis2* + """ + ), + self.dedent( + """ +

emphasis1

+
+

emphasis2

+ """ + ) + ) + + def test_hr_start_and_end(self): + # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially. + self.assertMarkdownRenders( + self.dedent( + """ + *emphasis1* + 
+ *emphasis2* + """ + ), + self.dedent( + """ +

emphasis1

+
+

+ emphasis2

+ """ + ) + ) + + def test_hr_only_end(self): + # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially. + self.assertMarkdownRenders( + self.dedent( + """ + *emphasis1* + + *emphasis2* + """ + ), + self.dedent( + """ + 

emphasis1 + + emphasis2

+ """ + ) + ) + + def test_hr_with_content(self): + # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially. + # Content is not allowed and will be treated as normal content between two hr tags. + self.assertMarkdownRenders( + self.dedent( + """ + *emphasis1* + 
+ **content** + + *emphasis2* + """ + ), + self.dedent( + """ +

emphasis1

+
+

content + + emphasis2

+ """ + ) + ) diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index 946e922..824917c 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -390,6 +390,24 @@ class TestMdInHTML(TestCase): ) ) + def test_empty_tags(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+
+
+ """ + ), + self.dedent( + """ +
+
+
+ """ + ) + ) + def test_orphan_end_tag_in_raw_html(self): self.assertMarkdownRenders( self.dedent( @@ -875,6 +893,129 @@ class TestMdInHTML(TestCase): ) ) + def test_md1_hr_only_start(self): + self.assertMarkdownRenders( + self.dedent( + """ + *emphasis1* +
+ *emphasis2* + """ + ), + self.dedent( + """ +

emphasis1

+
+

emphasis2

+ """ + ) + ) + + def test_md1_hr_self_close(self): + self.assertMarkdownRenders( + self.dedent( + """ + *emphasis1* +
+ *emphasis2* + """ + ), + self.dedent( + """ +

emphasis1

+
+

emphasis2

+ """ + ) + ) + + def test_md1_hr_start_and_end(self): + # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially. + self.assertMarkdownRenders( + self.dedent( + """ + *emphasis1* + 
+ *emphasis2* + """ + ), + self.dedent( + """ +

emphasis1

+
+

+ emphasis2

+ """ + ) + ) + + def test_md1_hr_only_end(self): + # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially. + self.assertMarkdownRenders( + self.dedent( + """ + *emphasis1* + + *emphasis2* + """ + ), + self.dedent( + """ + 

emphasis1 + + emphasis2

+ """ + ) + ) + + def test_md1_hr_with_content(self): + # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially. + # Content is not allowed and will be treated as normal content between two hr tags + self.assertMarkdownRenders( + self.dedent( + """ + *emphasis1* + 
+ **content** + + *emphasis2* + """ + ), + self.dedent( + """ +

emphasis1

+
+

content + + emphasis2

+ """ + ) + ) + + def test_no_md1_hr_with_content(self): + # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially. + # Content is not allowed and will be treated as normal content between two hr tags + self.assertMarkdownRenders( + self.dedent( + """ + *emphasis1* + 
+ **content** + + *emphasis2* + """ + ), + self.dedent( + """ +

emphasis1

+
+

content + + emphasis2

+ """ + ) + ) + def test_md1_nested_abbr_ref(self): self.assertMarkdownRenders( self.dedent(