jeanralphaviles · priv-kweihmann · Apr 12, 2021 · Jun 21, 2022 · Jun 21, 2022 · Jun 21, 2022
diff --git a/comment_parser/parsers/python_parser.py b/comment_parser/parsers/python_parser.py
@@ -9,8 +9,26 @@
 def extract_comments(code):
   """Extracts a list of comments from the given Python script.
 
-  Comments are identified using the tokenize module. Does not include function,
-  class, or module docstrings. All comments are single line comments.
+  Comments are identified using the tokenize module.
+    - Single-line comments, which begin with the '#' character and end at a line-break.
+    - Multi-line comments or docstrings, which are just triple-quoted strings
+      (delimited by ''' or by three double quotes), are told apart from regular
+      strings by the type of the previous token, which must be a line-break or
+      an indentation (NEWLINE, NL, INDENT or DEDENT) or no token at all (meaning
+      the string is the first thing in the script). Even in cases like this:
+
+        my_string = \
+        '''this should not be considered a comment'''
+
+        my_string = \
+          '''this should not either''' # <- notice the increasing indentation
+
+        my_string = \
+            '''weird syntax anyway''' # <- but still valid indentation
+
+      the token preceding the string is the '=' operator, not a line-break or an
+      indentation. That way, only triple-quoted strings preceded by a line-break,
+      an indentation, or no token at all are treated as comments.
 
   Args:
     code: String containing code to extract comments from.
@@ -19,11 +37,27 @@ def extract_comments(code):
   Raises:
     tokenize.TokenError
   """
+  triplequotes = ['"""', "'''"]
+  multicommprevnums = [
+      tokenize.ENCODING, tokenize.NEWLINE, tokenize.NL, tokenize.INDENT,
+      tokenize.DEDENT
+  ]
+  prevtoknum = None  # Stores the previous token's type.
   comments = []
   tokens = tokenize.tokenize(io.BytesIO(code.encode()).readline)
   for toknum, tokstring, tokloc, _, _ in tokens:
+    # Single-lined comment.
     if toknum is tokenize.COMMENT:
       # Removes leading '#' character.
       tokstring = tokstring[1:]
       comments.append(common.Comment(tokstring, tokloc[0], False))
+      continue
+    # Multi-lined comment.
+    if toknum is tokenize.STRING:
+      if tokstring[:3] in triplequotes and tokstring[-3:] in triplequotes:
+        if (not prevtoknum) or prevtoknum in multicommprevnums:
+          # Removes the leading and trailing triple quotes (""" or ''').
+          tokstring = tokstring[3:-3]
+          comments.append(common.Comment(tokstring, tokloc[0], True))
+    prevtoknum = toknum
   return comments
diff --git a/comment_parser/parsers/tests/python_parser_test.py b/comment_parser/parsers/tests/python_parser_test.py
@@ -45,3 +45,47 @@ def testEscapedDoubleQuote(self):
     comments = python_parser.extract_comments(code)
     expected = [common.Comment(code[3:], 1, multiline=False)]
     self.assertEqual(comments, expected)
+
+  def testTripleQuoteCommentsDoubleQuoteMultiline(self):
+    # A """-delimited string spanning several lines at the start of a script.
+    code = '"""this is triple quote comment\n'
+    code += 'with\n'
+    code += 'multiple\n'
+    code += 'lines\n'
+    code += '"""'
+    comments = python_parser.extract_comments(code)
+    # Expect a single multiline comment starting on line 1 whose text is the
+    # string's content with the surrounding quotes removed.
+    expected = [common.Comment(code.strip('"'), 1, multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testTripleQuoteCommentsDoubleQuoteSingleline(self):
+    # A """-delimited string that opens and closes on the same line.
+    code = '"""this is triple quote comment"""'
+    comments = python_parser.extract_comments(code)
+    # Triple-quoted strings are flagged multiline even when they occupy a
+    # single line; only the surrounding quotes are removed from the text.
+    expected = [common.Comment(code.strip('"'), 1, multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testTripleQuoteCommentsSingleQuoteMultiline(self):
+    # A '''-delimited string spanning several lines at the start of a script.
+    code = '\'\'\'this is triple quote comment\n'
+    code += 'with\n'
+    code += 'multiple\n'
+    code += 'lines\n'
+    code += '\'\'\''
+    comments = python_parser.extract_comments(code)
+    # Expect a single multiline comment starting on line 1 whose text is the
+    # string's content with the surrounding quotes removed.
+    expected = [common.Comment(code.strip('\''), 1, multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testTripleQuoteCommentsSingleQuoteSingleline(self):
+    # A '''-delimited string that opens and closes on the same line.
+    code = '\'\'\'this is triple quote comment\'\'\''
+    comments = python_parser.extract_comments(code)
+    # Triple-quoted strings are flagged multiline even when they occupy a
+    # single line; only the surrounding quotes are removed from the text.
+    expected = [common.Comment(code.strip('\''), 1, multiline=True)]
+    self.assertEqual(comments, expected)