Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 36 additions & 2 deletions comment_parser/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,26 @@
def extract_comments(code):
    r"""Extracts a list of comments from the given Python script.

    Comments are identified using the tokenize module. Two kinds are reported:

    - Single-line comments, which begin with the '#' character and end with a
      line-break.
    - Multi-line comments or docstrings, which are just triple-quoted strings
      (delimited by ''' or by three double quotes, optionally carrying an
      r/u prefix). They are told apart from regular strings by the type of
      the previous token, which must be a line-break or an indentation
      change (NEWLINE, NL, INDENT or DEDENT) or the leading ENCODING token
      (meaning the string is the first thing in the script). Even in cases
      like this:

          my_string = \
          '''this should not be considered a comment'''

          my_string = \
              '''this should not either'''

      the token preceding the string is the '=' operator and not a
      line-break or an indentation, so the string is not reported.

    Inside brackets tokenize emits NL tokens for physical line-breaks, so a
    triple-quoted string that sits on its own line within (), [] or {} (for
    example a list element or a call argument) is also preceded by NL. Such
    strings are values, not comments, and are skipped by tracking the
    bracket nesting depth.

    Args:
      code: String containing code to extract comments from.
    Returns:
      Python list of common.Comment in the order that they appear in the
      code. Single-line comments are reported without the leading '#';
      triple-quoted comments are reported without their prefix and quotes
      and are flagged as multiline.
    Raises:
      tokenize.TokenError
    """
    triple_quotes = ('"""', "'''")
    # Token types that may directly precede a statement-level string.
    statement_starts = (
        tokenize.ENCODING,
        tokenize.NEWLINE,
        tokenize.NL,
        tokenize.INDENT,
        tokenize.DEDENT,
    )
    prev_type = None  # Type of the previously seen token.
    depth = 0  # Current nesting level of (), [] and {}.
    comments = []
    tokens = tokenize.tokenize(io.BytesIO(code.encode()).readline)
    for tok_type, tok_string, tok_start, _, _ in tokens:
        # Token types are plain ints, so compare by value and not identity.
        if tok_type == tokenize.COMMENT:
            # Removes the leading '#' character.
            comments.append(
                common.Comment(tok_string[1:], tok_start[0], False))
        elif tok_type == tokenize.OP:
            if tok_string in ('(', '[', '{'):
                depth += 1
            elif tok_string in (')', ']', '}'):
                # Clamp so a stray closing bracket cannot disable detection.
                depth = max(depth - 1, 0)
        elif tok_type == tokenize.STRING and depth == 0:
            # Drop an optional raw/unicode prefix so that r'''...''' style
            # docstrings are recognised as well.
            literal = tok_string.lstrip('rRuU')
            is_triple_quoted = (literal[:3] in triple_quotes and
                                literal[-3:] in triple_quotes)
            starts_statement = (prev_type is None or
                                prev_type in statement_starts)
            if is_triple_quoted and starts_statement:
                # Removes the leading and trailing triple quotes.
                comments.append(
                    common.Comment(literal[3:-3], tok_start[0], True))
        prev_type = tok_type
    return comments
44 changes: 44 additions & 0 deletions comment_parser/parsers/tests/python_parser_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,47 @@ def testEscapedDoubleQuote(self):
comments = python_parser.extract_comments(code)
expected = [common.Comment(code[3:], 1, multiline=False)]
self.assertEqual(comments, expected)

def testTripleQuoteCommentsDoubleQuoteMultiline(self):
    # A triple-double-quoted string spanning several lines at the very start
    # of the script must be reported as one multiline comment on line 1.
    code = '"""this is triple quote comment\n'
    code += 'with\n'
    code += 'multiple\n'
    code += 'lines\n'
    code += '"""'
    comments = python_parser.extract_comments(code)
    # Expected text is the literal without its three-character delimiters.
    expected = [common.Comment(code[3:-3], 1, multiline=True)]
    self.assertEqual(comments, expected)

def testTripleQuoteCommentsDoubleQuoteSingleline(self):
    # A triple-double-quoted string on a single line is still reported as a
    # multiline-style comment.
    code = '"""this is triple quote comment"""'
    comments = python_parser.extract_comments(code)
    # Expected text is the literal without its three-character delimiters.
    expected = [common.Comment(code[3:-3], 1, multiline=True)]
    self.assertEqual(comments, expected)

def testTripleQuoteCommentsSingleQuoteMultiline(self):
    # A triple-single-quoted string spanning several lines at the very start
    # of the script must be reported as one multiline comment on line 1.
    code = "'''this is triple quote comment\n"
    code += 'with\n'
    code += 'multiple\n'
    code += 'lines\n'
    code += "'''"
    comments = python_parser.extract_comments(code)
    # Expected text is the literal without its three-character delimiters.
    expected = [common.Comment(code[3:-3], 1, multiline=True)]
    self.assertEqual(comments, expected)

def testTripleQuoteCommentsSingleQuoteSingleline(self):
    # A triple-single-quoted string on a single line is still reported as a
    # multiline-style comment.
    code = "'''this is triple quote comment'''"
    comments = python_parser.extract_comments(code)
    # Expected text is the literal without its three-character delimiters.
    expected = [common.Comment(code[3:-3], 1, multiline=True)]
    self.assertEqual(comments, expected)