--- /dev/null
+name: bot
+
+on:
+ pull_request:
+ branches:
+ - '**'
+
+jobs:
+ require_changelog:
+
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - uses: mskelton/changelog-reminder-action@v1
+ with:
+ # Match any file in the docs/change_log/ dir.
+ changelogRegex: "docs/change_log/.*"
+ # Only require changelog update if changes were made in markdown/
+ include: "markdown/.*"
+ message: |
+ @${{ github.actor }}, thank you for your contribution. It appears that you have not added a comment to the
+ change log describing the changes you have made. Doing so will help to ensure your contribution is accepted.
+
+ Please see the [Contributing Guide](https://python-markdown.github.io/contributing/#pull-requests) for details.
Python-Markdown Change Log
=========================
+Oct 25, 2020: version 3.3.3 (a bug-fix release).
+
+* Unify all block-level tags (#1047).
+* Fix issue where some empty elements would have text rendered as `None` when using `md_in_html` (#1049).
+* Avoid catastrophic backtracking in `hr` regex (#1055).
+* Fix `hr` HTML handling (#1053).
+
Oct 19, 2020: version 3.3.2 (a bug-fix release).
* Properly parse inline HTML in md_in_html (#1040 & #1045).
When the `markdown` attribute is set to `"1"`, then the parser will use the default behavior for that specific tag.
-The following tags have the `block` behavior by default: `address`, `article`, `aside`, `blockquote`, `body`,
-`colgroup`, `details`, `div`, `dl`, `fieldset`, `figcaption`, `figure`, `footer`, `form`, `iframe`, `header`, `hr`,
-`main`, `menu`, `nav`, `map`, `noscript`, `object`, `ol`, `section`, `table`, `tbody`, `thead`, `tfoot`, `tr`, and
-`ul`.
+The following tags have the `block` behavior by default: `article`, `aside`, `blockquote`, `body`, `colgroup`,
+`details`, `div`, `dl`, `fieldset`, `figcaption`, `figure`, `footer`, `form`, `group`, `header`, `hgroup`, `hr`,
+`iframe`, `main`, `map`, `menu`, `nav`, `noscript`, `object`, `ol`, `output`, `progress`, `section`, `table`,
+`tbody`, `tfoot`, `thead`, `tr`, `ul` and `video`.
For example, the following:
# (1, 2, 0, 'beta', 2) => "1.2b2"
# (1, 2, 0, 'rc', 4) => "1.2rc4"
# (1, 2, 0, 'final', 0) => "1.2"
-__version_info__ = (3, 3, 2, 'final', 0)
+__version_info__ = (3, 3, 3, 'final', 0)
def _get_version(version_info):
class HRProcessor(BlockProcessor):
""" Process Horizontal Rules. """
- RE = r'^[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*$'
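+ # Python's re module has no atomic-group syntax before Python 3.11. However, you can emulate one with a
+ # lookahead that captures the group, immediately followed by a backreference to that capture.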
+ # See https://stackoverflow.com/a/13577411/866026
+ RE = r'^[ ]{0,3}(?=(?P<atomicgroup>(-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,}))(?P=atomicgroup)[ ]*$'
# Detect hr on any line of a block.
SEARCH_RE = re.compile(RE, re.MULTILINE)
def test(self, parent, block):
m = self.SEARCH_RE.search(block)
- # No atomic grouping in python so we simulate it here for performance.
- # The regex only matches what would be in the atomic group - the HR.
- # Then check if we are at end of block or if next char is a newline.
- if m and (m.end() == len(block) or block[m.end()] == '\n'):
+ if m:
# Save match object on class instance so we can use it later.
self.match = m
return True
# See https://w3c.github.io/html/grouping-content.html#the-p-element
'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
- 'h4', 'h5', 'h6', 'header', 'hr', 'main', 'menu', 'nav', 'ol', 'p', 'pre',
- 'section', 'table', 'ul',
+ 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
+ 'p', 'pre', 'section', 'table', 'ul',
# Other elements which Markdown should not be mucking up the contents of.
- 'canvas', 'dd', 'dt', 'group', 'iframe', 'li', 'math', 'noscript', 'output',
- 'progress', 'script', 'style', 'tbody', 'td', 'th', 'thead', 'tr', 'video'
+ 'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend',
+ 'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
+ 'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video'
]
self.registeredExtensions = []
import xml.etree.ElementTree as etree
-# Block-level tags in which the content only gets span level parsing
-span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
-
-# Block-level tags in which the content gets parsed as blocks
-block_tags = [
- 'address', 'article', 'aside', 'blockquote', 'body', 'colgroup', 'details', 'div', 'dl', 'fieldset',
- 'figcaption', 'figure', 'footer', 'form', 'iframe', 'header', 'hr', 'main', 'menu', 'nav', 'map',
- 'noscript', 'object', 'ol', 'section', 'table', 'tbody', 'thead', 'tfoot', 'tr', 'ul'
-]
-
-# Block-level tags which never get their content parsed.
-raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea']
-
-block_level_tags = span_tags + block_tags + raw_tags
-
-
class HTMLExtractorExtra(HTMLExtractor):
"""
Override HTMLExtractor and create etree Elements for any elements which should have content parsed as Markdown.
"""
+ def __init__(self, md, *args, **kwargs):
+ # Build the per-instance tag classification sets, then derive `block_tags` from them.
+ # NOTE(review): the sets below are assigned before super().__init__(), presumably because the
+ # parent constructor calls self.reset() -- confirm before reordering.
+ # All block-level tags.
+ self.block_level_tags = set(md.block_level_elements.copy())
+ # Block-level tags in which the content only gets span level parsing
+ self.span_tags = set(
+ ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
+ )
+ # Block-level tags which never get their content parsed.
+ self.raw_tags = set(['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'])
+ # `self.empty_tags` is not assigned in this method; presumably the parent constructor sets it,
+ # which is why `block_tags` can only be computed after this call.
+ super().__init__(md, *args, **kwargs)
+
+ # Block-level tags in which the content gets parsed as blocks: every block-level tag that is
+ # not a span tag, a raw tag or an empty tag.
+ self.block_tags = set(self.block_level_tags) - (self.span_tags | self.raw_tags | self.empty_tags)
+ # Tags accepted when the `markdown` attribute explicitly asks for `block` or `span` handling.
+ self.span_and_blocks_tags = self.block_tags | self.span_tags
+
def reset(self):
"""Reset this instance. Loses all unprocessed data."""
self.mdstack = [] # When markdown=1, stack contains a list of tags
if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'):
# Only use the parent state if it is more restrictive than the markdown attribute.
md_attr = parent_state
- if ((md_attr == '1' and tag in block_tags) or
- (md_attr == 'block' and tag in span_tags + block_tags)):
+ if ((md_attr == '1' and tag in self.block_tags) or
+ (md_attr == 'block' and tag in self.span_and_blocks_tags)):
return 'block'
- elif ((md_attr == '1' and tag in span_tags) or
- (md_attr == 'span' and tag in span_tags + block_tags)):
+ elif ((md_attr == '1' and tag in self.span_tags) or
+ (md_attr == 'span' and tag in self.span_and_blocks_tags)):
return 'span'
- elif tag in block_level_tags:
+ elif tag in self.block_level_tags:
return 'off'
else: # pragma: no cover
return None
return value
def handle_starttag(self, tag, attrs):
- if tag in block_level_tags:
+ # Handle tags that should always be empty and do not specify a closing tag
+ if tag in self.empty_tags:
+ attrs = {key: value if value is not None else key for key, value in attrs}
+ if "markdown" in attrs:
+ attrs.pop('markdown')
+ element = etree.Element(tag, attrs)
+ data = etree.tostring(element, encoding='unicode', method='html')
+ else:
+ data = self.get_starttag_text()
+ self.handle_empty_tag(data, True)
+ return
+
+ if tag in self.block_level_tags:
# Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`.
# Convert to `{'checked': 'checked'}`.
attrs = {key: value if value is not None else key for key, value in attrs}
attrs.pop('markdown', None)
super().handle_starttag(tag, attrs)
else:
- if 'p' in self.mdstack and tag in block_level_tags:
+ if 'p' in self.mdstack and tag in self.block_level_tags:
# Close unclosed 'p' tag
self.handle_endtag('p')
self.mdstate.append(state)
self.handle_data(text)
def handle_endtag(self, tag):
- if tag in block_level_tags:
+ if tag in self.block_level_tags:
if self.inraw:
super().handle_endtag(tag)
elif tag in self.mdstack:
else:
self.handle_data(text)
+ def handle_startendtag(self, tag, attrs):
+ # Handle a tag that has no content and hand its text to `handle_empty_tag`.
+ # `attrs` is a list of `(name, value)` pairs; `value` is None for valueless attributes.
+ if tag in self.empty_tags:
+ # Valueless attr (ex: `<tag checked>`) becomes `{'checked': 'checked'}`.
+ attrs = {key: value if value is not None else key for key, value in attrs}
+ if "markdown" in attrs:
+ # Drop the `markdown` attribute and re-serialize the tag so it does not reach the output.
+ attrs.pop('markdown')
+ element = etree.Element(tag, attrs)
+ data = etree.tostring(element, encoding='unicode', method='html')
+ else:
+ # No `markdown` attribute: keep the tag text exactly as written in the source.
+ data = self.get_starttag_text()
+ else:
+ data = self.get_starttag_text()
+ self.handle_empty_tag(data, is_block=self.md.is_block_level(tag))
+
def handle_data(self, data):
if self.inraw or not self.mdstack:
super().handle_data(data)
else:
# Disable inline parsing for everything else
+ if element.text is None:
+ element.text = ''
element.text = util.AtomicString(element.text)
for child in list(element):
self.parse_element_content(child)
def __init__(self, md, *args, **kwargs):
if 'convert_charrefs' not in kwargs:
kwargs['convert_charrefs'] = False
+
+ # Block tags that should contain no content (self closing)
+ self.empty_tags = set(['hr'])
+
# This calls self.reset
super().__init__(*args, **kwargs)
self.md = md
return '</{}>'.format(tag)
def handle_starttag(self, tag, attrs):
+ # Handle tags that should always be empty and do not specify a closing tag
+ if tag in self.empty_tags:
+ self.handle_startendtag(tag, attrs)
+ return
+
if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
# Started a new raw block. Prepare stack.
self.inraw = True
else:
# More content exists after tag.
self.intail = True
+ item = self.cleandoc[-1] if self.cleandoc else ''
+ # If we only have one newline before block element, add another
+ if not item.endswith('\n\n') and item.endswith('\n'):
+ self.cleandoc.append('\n')
self.cleandoc.append(self.md.htmlStash.store(data))
# Insert blank line between this and next line.
self.cleandoc.append('\n\n')
# See https://w3c.github.io/html/grouping-content.html#the-p-element
'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
- 'h4', 'h5', 'h6', 'header', 'hr', 'main', 'menu', 'nav', 'ol', 'p', 'pre',
- 'section', 'table', 'ul',
+ 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
+ 'p', 'pre', 'section', 'table', 'ul',
# Other elements which Markdown should not be mucking up the contents of.
- 'canvas', 'dd', 'dt', 'group', 'iframe', 'li', 'math', 'noscript', 'output',
- 'progress', 'script', 'style', 'tbody', 'td', 'th', 'thead', 'tr', 'video'
+ 'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend',
+ 'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
+ 'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video'
]
# Placeholders
'<p>_ _</p>'
)
+
+ def test_2_consecutive_hr(self):
+ # Two adjacent `- - -` lines must each render as their own `<hr />`.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ - - -
+ - - -
+ """
+ ),
+ self.dedent(
+ """
+ <hr />
+ <hr />
+ """
+ )
+ )
+
+ def test_not_hr_end_in_char(self):
+ # A long run of dashes terminated by a non-dash character is a paragraph, not an hr.
+ self.assertMarkdownRenders(
+ '--------------------------------------c',
+
+ '<p>--------------------------------------c</p>'
+ )
"""
)
)
+
+ def test_hr_only_start(self):
+ # A bare `<hr>` start tag between two text lines splits them into separate paragraphs.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ *emphasis1*
+ <hr>
+ *emphasis2*
+ """
+ ),
+ self.dedent(
+ """
+ <p><em>emphasis1</em></p>
+ <hr>
+ <p><em>emphasis2</em></p>
+ """
+ )
+ )
+
+ def test_hr_self_close(self):
+ # A self-closed `<hr/>` is passed through unchanged and splits the surrounding text into paragraphs.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ *emphasis1*
+ <hr/>
+ *emphasis2*
+ """
+ ),
+ self.dedent(
+ """
+ <p><em>emphasis1</em></p>
+ <hr/>
+ <p><em>emphasis2</em></p>
+ """
+ )
+ )
+
+ def test_hr_start_and_end(self):
+ # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ *emphasis1*
+ <hr></hr>
+ *emphasis2*
+ """
+ ),
+ self.dedent(
+ """
+ <p><em>emphasis1</em></p>
+ <hr>
+ <p></hr>
+ <em>emphasis2</em></p>
+ """
+ )
+ )
+
+ def test_hr_only_end(self):
+ # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ *emphasis1*
+ </hr>
+ *emphasis2*
+ """
+ ),
+ self.dedent(
+ """
+ <p><em>emphasis1</em>
+ </hr>
+ <em>emphasis2</em></p>
+ """
+ )
+ )
+
+ def test_hr_with_content(self):
+ # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+ # Content is not allowed and will be treated as normal content between two hr tags.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ *emphasis1*
+ <hr>
+ **content**
+ </hr>
+ *emphasis2*
+ """
+ ),
+ self.dedent(
+ """
+ <p><em>emphasis1</em></p>
+ <hr>
+ <p><strong>content</strong>
+ </hr>
+ <em>emphasis2</em></p>
+ """
+ )
+ )
)
)
+ def test_empty_tags(self):
+ # A nested block element with no content inside a `markdown="1"` container must render as an
+ # empty element.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ <div markdown="1">
+ <div></div>
+ </div>
+ """
+ ),
+ self.dedent(
+ """
+ <div>
+ <div></div>
+ </div>
+ """
+ )
+ )
+
def test_orphan_end_tag_in_raw_html(self):
self.assertMarkdownRenders(
self.dedent(
)
)
+ def test_md1_hr_only_start(self):
+ # The `markdown` attribute on a bare `<hr>` start tag is dropped from the rendered output.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ *emphasis1*
+ <hr markdown="1">
+ *emphasis2*
+ """
+ ),
+ self.dedent(
+ """
+ <p><em>emphasis1</em></p>
+ <hr>
+ <p><em>emphasis2</em></p>
+ """
+ )
+ )
+
+ def test_md1_hr_self_close(self):
+ # A self-closed `<hr markdown="1" />` renders as a plain `<hr>` without the attribute.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ *emphasis1*
+ <hr markdown="1" />
+ *emphasis2*
+ """
+ ),
+ self.dedent(
+ """
+ <p><em>emphasis1</em></p>
+ <hr>
+ <p><em>emphasis2</em></p>
+ """
+ )
+ )
+
+ def test_md1_hr_start_and_end(self):
+ # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ *emphasis1*
+ <hr markdown="1"></hr>
+ *emphasis2*
+ """
+ ),
+ self.dedent(
+ """
+ <p><em>emphasis1</em></p>
+ <hr>
+ <p></hr>
+ <em>emphasis2</em></p>
+ """
+ )
+ )
+
+ def test_md1_hr_only_end(self):
+ # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ *emphasis1*
+ </hr>
+ *emphasis2*
+ """
+ ),
+ self.dedent(
+ """
+ <p><em>emphasis1</em>
+ </hr>
+ <em>emphasis2</em></p>
+ """
+ )
+ )
+
+ def test_md1_hr_with_content(self):
+ # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+ # Content is not allowed and will be treated as normal content between two hr tags.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ *emphasis1*
+ <hr markdown="1">
+ **content**
+ </hr>
+ *emphasis2*
+ """
+ ),
+ self.dedent(
+ """
+ <p><em>emphasis1</em></p>
+ <hr>
+ <p><strong>content</strong>
+ </hr>
+ <em>emphasis2</em></p>
+ """
+ )
+ )
+
+ def test_no_md1_hr_with_content(self):
+ # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+ # Content is not allowed and will be treated as normal content between two hr tags.
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ *emphasis1*
+ <hr>
+ **content**
+ </hr>
+ *emphasis2*
+ """
+ ),
+ self.dedent(
+ """
+ <p><em>emphasis1</em></p>
+ <hr>
+ <p><strong>content</strong>
+ </hr>
+ <em>emphasis2</em></p>
+ """
+ )
+ )
+
def test_md1_nested_abbr_ref(self):
self.assertMarkdownRenders(
self.dedent(