diff --git a/comment_parser/parsers/python_parser.py b/comment_parser/parsers/python_parser.py index e3e6fa3..103287d 100644 --- a/comment_parser/parsers/python_parser.py +++ b/comment_parser/parsers/python_parser.py @@ -9,8 +9,26 @@ def extract_comments(code): """Extracts a list of comments from the given Python script. - Comments are identified using the tokenize module. Does not include function, - class, or module docstrings. All comments are single line comments. + Comments are identified using the tokenize module. + - Single-lined comments which begin with the '#' character and end with a line-break. + - Multi-lined comments or docstrings, which are just triple-quoted strings (start + and end with ''' or 3 of these "), are told apart from regular strings by the + type of the previous token which should be a line-break or an indentation (NEWLINE, + NL, INDENT or DEDENT) or no token at all (it would mean it's the first thing in + the script). Even in cases like this: + + my_string = \ + '''this should not be considered a comment''' + + my_string = \ + '''this should not either''' # <- notice the increasing indentation + + my_string = \ + '''weird syntax anyway''' # <- but still valid indentation + + the previous token to the string is the '=' operator and not a line-break or an + indentation. That way, only triple-quoted strings preceded by a line-break, an + indentation, or no token, will be considered intended as comments. Args: code: String containing code to extract comments from. @@ -19,11 +37,27 @@ def extract_comments(code): Raises: tokenize.TokenError """ + triplequotes = ['"""', "'''"] + multicommprevnums = [ + tokenize.ENCODING, tokenize.NEWLINE, tokenize.NL, tokenize.INDENT, + tokenize.DEDENT + ] + prevtoknum = None # Stores the previous token's type. comments = [] tokens = tokenize.tokenize(io.BytesIO(code.encode()).readline) for toknum, tokstring, tokloc, _, _ in tokens: + # Single-lined comment. if toknum is tokenize.COMMENT: # Removes leading '#' character. tokstring = tokstring[1:] comments.append(common.Comment(tokstring, tokloc[0], False)) + continue + # Multi-lined comment. + if toknum is tokenize.STRING: + if tokstring[:3] in triplequotes and tokstring[-3:] in triplequotes: + if (not prevtoknum) or prevtoknum in multicommprevnums: + # Removes the leading and preceding 3ple quotes (""" or '''). + tokstring = tokstring[3:-3] + comments.append(common.Comment(tokstring, tokloc[0], True)) + prevtoknum = toknum return comments diff --git a/comment_parser/parsers/tests/python_parser_test.py b/comment_parser/parsers/tests/python_parser_test.py index f6f1a38..4930345 100644 --- a/comment_parser/parsers/tests/python_parser_test.py +++ b/comment_parser/parsers/tests/python_parser_test.py @@ -45,3 +45,47 @@ def testEscapedDoubleQuote(self): comments = python_parser.extract_comments(code) expected = [common.Comment(code[3:], 1, multiline=False)] self.assertEqual(comments, expected) + + def testTripleQuoteCommentsDoubleQuoteMultiline(self): + code = '"""this is triple quote comment\n' + code += 'with\n' + code += 'multiple\n' + code += 'lines\n' + code += '"""' + comments = python_parser.extract_comments(code) + import logging + logging.warning(comments) + logging.warning(code) + expected = [common.Comment(code.strip('"'), 1, multiline=True)] + self.assertEqual(comments, expected) + + def testTripleQuoteCommentsDoubleQuoteSingleline(self): + code = '"""this is triple quote comment"""' + comments = python_parser.extract_comments(code) + import logging + logging.warning(comments) + logging.warning(code) + expected = [common.Comment(code.strip('"'), 1, multiline=True)] + self.assertEqual(comments, expected) + + def testTripleQuoteCommentsSingleQuoteMultiline(self): + code = '\'\'\'this is triple quote comment\n' + code += 'with\n' + code += 'multiple\n' + code += 'lines\n' + code += '\'\'\'' + comments = python_parser.extract_comments(code) + import logging + logging.warning(comments) + logging.warning(code) + expected = [common.Comment(code.strip('\''), 1, multiline=True)] + self.assertEqual(comments, expected) + + def testTripleQuoteCommentsSingleQuoteSingleline(self): + code = '\'\'\'this is triple quote comment\'\'\'' + comments = python_parser.extract_comments(code) + import logging + logging.warning(comments) + logging.warning(code) + expected = [common.Comment(code.strip('\''), 1, multiline=True)] + self.assertEqual(comments, expected)