cpp.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. #
  2. # Metrix++, Copyright 2009-2019, Metrix++ Project
  3. # Link: https://github.com/metrixplusplus/metrixplusplus
  4. #
  5. # This file is a part of Metrix++ Tool.
  6. #
  7. import re
  8. import binascii
  9. import mpp.api
  10. import mpp.cout
  11. class Plugin(mpp.api.Plugin, mpp.api.Parent, mpp.api.IParser, mpp.api.IConfigurable, mpp.api.ICode):
  12. def declare_configuration(self, parser):
  13. parser.add_option("--std.code.cpp.files", default="*.c,*.h,*.cpp,*.hpp,*.cc,*.hh,*.cxx,*.hxx",
  14. help="Enumerates filename extensions to match C/C++ files [default: %default]")
  15. def configure(self, options):
  16. self.files = options.__dict__['std.code.cpp.files'].split(',')
  17. self.files.sort() # sorted list goes to properties
  18. def initialize(self):
  19. mpp.api.Plugin.initialize(self, properties=[
  20. self.Property('files', ','.join(self.files))
  21. ])
  22. self.get_plugin('std.tools.collect').register_parser(self.files, self)
  23. def process(self, parent, data, is_updated):
  24. is_updated = is_updated or self.is_updated
  25. count_mismatched_brackets = 0
  26. if is_updated == True:
  27. count_mismatched_brackets = CppCodeParser().run(data)
  28. #else:
  29. # data.load_regions()
  30. #data.load_markers()
  31. self.notify_children(data, is_updated)
  32. # TODO: if not updated number of parser errors is zero, should read from the prev database
  33. # but reading of number of errors from the database will slow the process
  34. # maybe it is better to return zero always?
  35. return count_mismatched_brackets
  36. class CppCodeParser(object):
  37. regex_cpp = re.compile(r'''
  38. /([\\](?:\n|\r\n|\r))*/(?=\n|\r\n|\r) # Match C++ style comments (empty comment line)
  39. | /([\\](?:\n|\r\n|\r))*/.*?[^\\](?=\n|\r\n|\r) # Match C++ style comments
  40. # NOTE: end of line is NOT consumed
  41. # NOTE: ([\\](?:\n|\r\n|\r))* for new line separators,
  42. # Need to support new line separators in expense of efficiency?
  43. | /\*\*/ # Match C style comments (empty comment line)
  44. | /([\\](?:\n|\r\n|\r))*\*.*?\*([\\](?:\n|\r\n|\r))*/ # Match C style comments
  45. | \'(?:\\.|[^\\\'])*\' # Match quoted strings
  46. | "(?:\\.|[^\\"])*" # Match double quoted strings
  47. | (((?<=\n|\r)|^)[ \t]*[#].*?[^\\](?=\n|\r\n|\r)) # Match preprocessor
  48. # NOTE: end of line is NOT consumed
  49. # NOTE: beginning of line is NOT consumed
  50. | (?P<fn_name>
  51. (operator( # Match C++ operator ...
  52. (\s+[_a-zA-Z][_a-zA-Z0-9]*(\s*\[\s*\])?) # - cast, new and delete operators
  53. | (\s*\[\s*\]) # - operator []
  54. | (\s*\(\s*\)) # - operator ()
  55. | (\s*[+-\\*/=<>!%&^|~,?.]{1,3}) # - other operators (from 1 to 3 symbols)
  56. ))
  57. | ([~]?[_a-zA-Z][_a-zA-Z0-9]*) # ... or function or constructor
  58. )\s*[(] # LIMITATION: if there are comments after function name
  59. # and before '(', it is not detected
  60. # LIMITATION: if there are comments within operator definition,
  61. # if may be not detected
  62. | ((?P<block_type>\bclass|\bstruct|\bunion|\bnamespace) # Match C++ class or struct
  63. (?P<block_name>((\s+[a-zA-Z_][a-zA-Z0-9_]*)|(?=\s*[{])))) # noname is supported, symbol '{' is not consumed
  64. # LIMITATION: if there are comments between keyword and name,
  65. # it is not detected
  66. | [<>{};:] # Match block start/end, brackets and statement separator
  67. | ((?:\n|\r\n|\r)\s*(?:\n|\r\n|\r)) # Match double empty line
  68. ''',
  69. re.DOTALL | re.MULTILINE | re.VERBOSE
  70. )
  71. # \r\n goes before \r in order to consume right number of lines on Unix for Windows files
  72. regex_ln = re.compile(r'(\n)|(\r\n)|(\r)')
  73. def run(self, data):
  74. self.__init__() # Go to initial state if it is called twice
  75. return self.parse(data)
  76. def finalize_block(self, text, block, block_end):
  77. if block['type'] != '__global__':
  78. # do not trim spaces for __global__region
  79. space_match = re.match('^\s*', text[block['start']:block_end], re.MULTILINE)
  80. block['start'] += space_match.end() # trim spaces at the beginning
  81. block['end'] = block_end
  82. start_pos = block['start']
  83. crc32 = 0
  84. for child in block['children']:
  85. # exclude children
  86. crc32 = binascii.crc32(text[start_pos:child['start']].encode('utf8'), crc32)
  87. start_pos = child['end']
  88. block['checksum'] = binascii.crc32(text[start_pos:block['end']].encode('utf8'), crc32) & 0xffffffff # to match python 3
  89. def add_lines_data(self, text, blocks):
  90. def add_lines_data_rec(self, text, blocks):
  91. for each in blocks:
  92. # add line begin
  93. self.total_current += len(self.regex_ln.findall(text, self.total_last_pos, each['start']))
  94. each['line_begin'] = self.total_current
  95. self.total_last_pos = each['start']
  96. # process enclosed
  97. add_lines_data_rec(self, text, each['children'])
  98. # add line end
  99. self.total_current += len(self.regex_ln.findall(text, self.total_last_pos, each['end']))
  100. each['line_end'] = self.total_current
  101. self.total_last_pos = each['end']
  102. self.total_last_pos = 0
  103. self.total_current = 1
  104. add_lines_data_rec(self, text, blocks)
  105. def add_regions(self, data, blocks):
  106. # Note: data.add_region() internals depend on special ordering of regions
  107. # in order to identify enclosed regions efficiently
  108. def add_regions_rec(self, data, blocks):
  109. def get_type_id(data, named_type):
  110. if named_type == "function":
  111. return mpp.api.Region.T.FUNCTION
  112. elif named_type == "class":
  113. return mpp.api.Region.T.CLASS
  114. elif named_type == "struct":
  115. return mpp.api.Region.T.STRUCT
  116. elif named_type == "union":
  117. return mpp.api.Region.T.STRUCT
  118. elif named_type == "namespace":
  119. return mpp.api.Region.T.NAMESPACE
  120. elif named_type == "__global__":
  121. return mpp.api.Region.T.GLOBAL
  122. else:
  123. assert(False)
  124. for each in blocks:
  125. data.add_region(each['name'], each['start'], each['end'],
  126. each['line_begin'], each['line_end'], each['cursor'],
  127. get_type_id(data, each['type']), each['checksum'])
  128. add_regions_rec(self, data, each['children'])
  129. add_regions_rec(self, data, blocks)
  130. def parse(self, data):
  131. def reset_next_block(start):
  132. return {'name':'', 'start':start, 'cursor':0, 'type':'', 'confirmed':False}
  133. count_mismatched_brackets = 0
  134. text = data.get_content()
  135. indent_current = 0;
  136. blocks = [{'name':'__global__', 'start':0, 'cursor':0, 'type':'__global__', 'indent_start':indent_current, 'children':[]}]
  137. curblk = 0
  138. next_block = reset_next_block(0)
  139. cursor_last_pos = 0
  140. cursor_current = 1
  141. for m in re.finditer(self.regex_cpp, text):
  142. # Comment
  143. if text[m.start()] == '/':
  144. data.add_marker(m.start(), m.end(), mpp.api.Marker.T.COMMENT)
  145. # String
  146. elif text[m.start()] == '"' or text[m.start()] == '\'':
  147. data.add_marker(m.start() + 1, m.end() - 1, mpp.api.Marker.T.STRING)
  148. # Preprocessor (including internal comments)
  149. elif text[m.start()] == ' ' or text[m.start()] == '\t' or text[m.start()] == '#':
  150. data.add_marker(m.start(), m.end(), mpp.api.Marker.T.PREPROCESSOR)
  151. # Statement end
  152. elif text[m.start()] == ';':
  153. # Reset next block name and start
  154. next_block['name'] = ""
  155. next_block['start'] = m.end() # potential region start
  156. # Template argument closing bracket
  157. elif text[m.start()] == '>':
  158. # Reset next block name (in order to skip class names in templates), if has not been confirmed before
  159. if next_block['confirmed'] == False and (next_block['type'] == 'class' or next_block['type'] == 'struct'):
  160. next_block['name'] = ""
  161. # Template argument opening bracket or after class inheritance specification
  162. elif text[m.start()] == ':' or text[m.start()] == '<':
  163. # .. if goes after calss definition
  164. if next_block['type'] == 'class' or next_block['type'] == 'struct':
  165. next_block['confirmed'] = True
  166. # Double end line
  167. elif text[m.start()] == '\n' or text[m.start()] == '\r':
  168. # Reset next block start, if has not been named yet
  169. if next_block['name'] == "":
  170. next_block['start'] = m.end() # potential region start
  171. # Block start...
  172. elif text[m.start()] == '{':
  173. # shift indent right
  174. indent_current += 1
  175. # ... if name detected previously
  176. if next_block['name'] != '': # - Start of enclosed block
  177. blocks.append({'name':next_block['name'],
  178. 'start':next_block['start'],
  179. 'cursor':next_block['cursor'],
  180. 'type':next_block['type'],
  181. 'indent_start':indent_current,
  182. 'children':[]})
  183. next_block = reset_next_block(m.end())
  184. curblk += 1
  185. # ... reset next block start, otherwise
  186. else: # - unknown type of block start
  187. next_block['start'] = m.end() # potential region start
  188. # Block end...
  189. elif text[m.start()] == '}':
  190. # ... if indent level matches the start
  191. if blocks[curblk]['indent_start'] == indent_current:
  192. next_block = reset_next_block(m.end())
  193. if curblk == 0:
  194. mpp.cout.notify(data.get_path(),
  195. cursor_current + len(self.regex_ln.findall(text, cursor_last_pos, m.start())),
  196. mpp.cout.SEVERITY_WARNING,
  197. "Non-matching closing bracket '}' detected.")
  198. count_mismatched_brackets += 1
  199. continue
  200. self.finalize_block(text, blocks[curblk], m.end())
  201. assert(blocks[curblk]['type'] != '__global__')
  202. curblk -= 1
  203. assert(curblk >= 0)
  204. blocks[curblk]['children'].append(blocks.pop())
  205. # shift indent left
  206. indent_current -= 1
  207. if indent_current < 0:
  208. mpp.cout.notify(data.get_path(),
  209. cursor_current + len(self.regex_ln.findall(text, cursor_last_pos, m.start())),
  210. mpp.cout.SEVERITY_WARNING,
  211. "Non-matching closing bracket '}' detected.")
  212. count_mismatched_brackets += 1
  213. indent_current = 0
  214. # Potential namespace, struct, class
  215. elif m.group('block_type') != None:
  216. if next_block['name'] == "":
  217. # - 'name'
  218. next_block['name'] = m.group('block_name').strip()
  219. if next_block['name'] == "":
  220. next_block['name'] = '__noname__'
  221. # - 'cursor'
  222. cursor_current += len(self.regex_ln.findall(text, cursor_last_pos, m.start('block_name')))
  223. cursor_last_pos = m.start('block_name')
  224. next_block['cursor'] = cursor_current
  225. # - 'type'
  226. next_block['type'] = m.group('block_type').strip()
  227. # - 'start' detected earlier
  228. # Potential function name detected...
  229. elif m.group('fn_name') != None:
  230. # ... if outside of a function (do not detect enclosed functions, unless classes are matched)
  231. # wander why 'or next_block['type'] != 'function'' is in the condition?
  232. # - remove it, run the tests and will see
  233. if blocks[curblk]['type'] != 'function' and (next_block['name'] == "" or next_block['type'] != 'function'):
  234. # - 'name'
  235. next_block['name'] = m.group('fn_name').strip()
  236. # - 'cursor'
  237. cursor_current += len(self.regex_ln.findall(text, cursor_last_pos, m.start('fn_name')))
  238. cursor_last_pos = m.start('fn_name')
  239. # NOTE: cursor could be collected together with line_begin, line_end,
  240. # but we keep it here separately for easier debugging of file parsing problems
  241. next_block['cursor'] = cursor_current
  242. # - 'type'
  243. next_block['type'] = 'function'
  244. # - 'start' detected earlier
  245. else:
  246. assert(len("Unknown match by regular expression") == 0)
  247. while indent_current > 0:
  248. # log all
  249. mpp.cout.notify(data.get_path(),
  250. cursor_current + len(self.regex_ln.findall(text, cursor_last_pos, len(text))),
  251. mpp.cout.SEVERITY_WARNING,
  252. "Non-matching opening bracket '{' detected.")
  253. count_mismatched_brackets += 1
  254. indent_current -= 1
  255. for (ind, each) in enumerate(blocks):
  256. each = each # used
  257. block = blocks[len(blocks) - 1 - ind]
  258. self.finalize_block(text, block, len(text))
  259. self.add_lines_data(text, blocks)
  260. self.add_regions(data, blocks)
  261. return count_mismatched_brackets