cs.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316
  1. #
  2. # Metrix++, Copyright 2009-2019, Metrix++ Project
  3. # Link: https://github.com/metrixplusplus/metrixplusplus
  4. #
  5. # This file is a part of Metrix++ Tool.
  6. #
  7. import re
  8. import binascii
  9. import mpp.api
  10. import mpp.cout
  11. class Plugin(mpp.api.Plugin, mpp.api.Parent, mpp.api.IParser, mpp.api.IConfigurable, mpp.api.ICode):
  12. def declare_configuration(self, parser):
  13. parser.add_option("--std.code.cs.files", default="*.cs",
  14. help="Enumerates filename extensions to match C# files [default: %default]")
  15. def configure(self, options):
  16. self.files = options.__dict__['std.code.cs.files'].split(',')
  17. self.files.sort() # sorted list goes to properties
  18. def initialize(self):
  19. mpp.api.Plugin.initialize(self, properties=[
  20. self.Property('files', ','.join(self.files))
  21. ])
  22. self.get_plugin('std.tools.collect').register_parser(self.files, self)
  23. def process(self, parent, data, is_updated):
  24. is_updated = is_updated or self.is_updated
  25. count_mismatched_brackets = 0
  26. if is_updated == True:
  27. count_mismatched_brackets = CsCodeParser().run(data)
  28. self.notify_children(data, is_updated)
  29. return count_mismatched_brackets
  30. class CsCodeParser(object):
  31. regex_cpp = re.compile(r'''
  32. //(?=\n|\r\n|\r) # Match C# style comments (empty comment line)
  33. | //.*?(?=\n|\r\n|\r) # Match C# style comments
  34. # NOTE: end of line is NOT consumed
  35. # NOTE: it is slightly different in C++
  36. | /\*\*/ # Match C style comments (empty comment line)
  37. # NOTE: it is slightly different in C++
  38. | /\*.*?\*/ # Match C style comments
  39. # NOTE: it is slightly different in C++
  40. | \'(?:\\.|[^\\\'])*\' # Match quoted strings
  41. | "(?:\\.|[^\\"])*" # Match double quoted strings
  42. | (((?<=\n|\r)|^)[ \t]*[#].*?(?=\n|\r\n|\r)) # Match preprocessor
  43. # NOTE: end of line is NOT consumed
  44. # NOTE: beginning of line is NOT consumed
  45. # NOTE: C# does not support backslashing as C++ does
  46. | (?P<fn_name>
  47. (operator( # Match C# operator ...
  48. (\s+[_a-zA-Z][_a-zA-Z0-9]*(\s*\[\s*\])?) # - cast, true, false operators
  49. | (\s*\[\s*\]) # - operator []
  50. | (\s*\(\s*\)) # - operator ()
  51. | (\s*[+-\\*/=<>!%&^|~,?.]{1,3}) # - other operators (from 1 to 3 symbols)
  52. # NOTE: maybe dot and ? should not be in the list...
  53. ))
  54. | (([~]\s*)?[_a-zA-Z][_a-zA-Z0-9]* # ... or function or constructor
  55. (\s*[<]\s*[_a-zA-Z0-9]+\s*([,]\s*[_a-zA-Z0-9]+\s*)*[>])? # NOTE: takes care of generics with multiple parameters
  56. (\s*[.]\s*[a-zA-Z_][a-zA-Z0-9_]*
  57. (\s*[<]\s*[_a-zA-Z0-9]+\s*([,]\s*[_a-zA-Z0-9]+\s*)*[>])?)*) # NOTE: takes care of generics with multiple parameters
  58. # NOTE: C# destructor can have spaces in name after ~
  59. # NOTE: explicit interface implementation method has got a dot
  60. | (?P<prop_setget>get|set) # ... or property setter/getter
  61. )\s*(?(prop_setget)(?=[{])|[(])
  62. # LIMITATION: if there are comments after function name
  63. # and before '(', it is not detected
  64. # LIMITATION: if there are comments within operator definition,
  65. # if may be not detected
  66. # LIMITATION: if there are comments after set|get keyword,
  67. # if may be not detected
  68. | ((?P<block_type>\bclass|\bstruct|\bnamespace|\binterface) # Match class or struct or interface or namespace
  69. (?P<block_name>(\s+[a-zA-Z_][a-zA-Z0-9_]*
  70. (\s*[<]\s*[_a-zA-Z0-9]+\s*([,]\s*[_a-zA-Z0-9]+\s*)*[>])? # NOTE: takes care of generics with multiple parameters
  71. )))
  72. # NOTE: noname instances are impossible in C#
  73. # LIMITATION: if there are comments between keyword and name,
  74. # it is not detected
  75. | [\[\]{};] # Match block start/end and statement separator
  76. # NOTE: C++ parser includes processing of <> and :
  77. # to handle template definitions, it is easier in C#
  78. | ((?:\n|\r\n|\r)\s*(?:\n|\r\n|\r)) # Match double empty line
  79. ''',
  80. re.DOTALL | re.MULTILINE | re.VERBOSE
  81. )
  82. # \r\n goes before \r in order to consume right number of lines on Unix for Windows files
  83. regex_ln = re.compile(r'(\n)|(\r\n)|(\r)')
  84. def run(self, data):
  85. self.__init__() # Go to initial state if it is called twice
  86. return self.parse(data)
  87. def finalize_block(self, text, block, block_end):
  88. if block['type'] != '__global__':
  89. # do not trim spaces for __global__region
  90. space_match = re.match('^\s*', text[block['start']:block_end], re.MULTILINE)
  91. block['start'] += space_match.end() # trim spaces at the beginning
  92. block['end'] = block_end
  93. start_pos = block['start']
  94. crc32 = 0
  95. for child in block['children']:
  96. # exclude children
  97. crc32 = binascii.crc32(text[start_pos:child['start']].encode('utf8'), crc32)
  98. start_pos = child['end']
  99. block['checksum'] = binascii.crc32(text[start_pos:block['end']].encode('utf8'), crc32) & 0xffffffff # to match python 3
  100. def add_lines_data(self, text, blocks):
  101. def add_lines_data_rec(self, text, blocks):
  102. for each in blocks:
  103. # add line begin
  104. self.total_current += len(self.regex_ln.findall(text, self.total_last_pos, each['start']))
  105. each['line_begin'] = self.total_current
  106. self.total_last_pos = each['start']
  107. # process enclosed
  108. add_lines_data_rec(self, text, each['children'])
  109. # add line end
  110. self.total_current += len(self.regex_ln.findall(text, self.total_last_pos, each['end']))
  111. each['line_end'] = self.total_current
  112. self.total_last_pos = each['end']
  113. self.total_last_pos = 0
  114. self.total_current = 1
  115. add_lines_data_rec(self, text, blocks)
  116. def add_regions(self, data, blocks):
  117. # Note: data.add_region() internals depend on special ordering of regions
  118. # in order to identify enclosed regions efficiently
  119. def add_regions_rec(self, data, blocks):
  120. def get_type_id(data, named_type):
  121. if named_type == "function":
  122. return mpp.api.Region.T.FUNCTION
  123. elif named_type == "class":
  124. return mpp.api.Region.T.CLASS
  125. elif named_type == "struct":
  126. return mpp.api.Region.T.STRUCT
  127. elif named_type == "namespace":
  128. return mpp.api.Region.T.NAMESPACE
  129. elif named_type == "interface":
  130. return mpp.api.Region.T.INTERFACE
  131. elif named_type == "__global__":
  132. return mpp.api.Region.T.GLOBAL
  133. else:
  134. assert(False)
  135. for each in blocks:
  136. data.add_region(each['name'], each['start'], each['end'],
  137. each['line_begin'], each['line_end'], each['cursor'],
  138. get_type_id(data, each['type']), each['checksum'])
  139. add_regions_rec(self, data, each['children'])
  140. add_regions_rec(self, data, blocks)
  141. def parse(self, data):
  142. def reset_next_block(start):
  143. return {'name':'', 'start':start, 'cursor':0, 'type':'', 'inside_attribute':False}
  144. count_mismatched_brackets = 0
  145. text = data.get_content()
  146. indent_current = 0;
  147. blocks = [{'name':'__global__', 'start':0, 'cursor':0, 'type':'__global__', 'indent_start':indent_current, 'children':[]}]
  148. curblk = 0
  149. next_block = reset_next_block(0)
  150. cursor_last_pos = 0
  151. cursor_current = 1
  152. for m in re.finditer(self.regex_cpp, text):
  153. # Comment
  154. if text[m.start()] == '/':
  155. data.add_marker(m.start(), m.end(), mpp.api.Marker.T.COMMENT)
  156. # String
  157. elif text[m.start()] == '"' or text[m.start()] == '\'':
  158. data.add_marker(m.start() + 1, m.end() - 1, mpp.api.Marker.T.STRING)
  159. # Preprocessor (including internal comments)
  160. elif text[m.start()] == ' ' or text[m.start()] == '\t' or text[m.start()] == '#':
  161. data.add_marker(m.start(), m.end(), mpp.api.Marker.T.PREPROCESSOR)
  162. # Statement end
  163. elif text[m.start()] == ';':
  164. # Reset next block name and start
  165. next_block['name'] = ""
  166. next_block['start'] = m.end() # potential region start
  167. # Block openned by '[' bracket...
  168. elif text[m.start()] == '[':
  169. # ... may include attributes, so do not capture function names inside
  170. next_block['inside_attribute'] = True
  171. # Block closed by ']' bracket...
  172. # note: do not care about nesting for simplicity -
  173. # because attribute's statement can not have symbol ']' inside
  174. elif text[m.start()] == ']':
  175. # ... may include attributes, so do not capture function names inside
  176. next_block['inside_attribute'] = False
  177. # Double end line
  178. elif text[m.start()] == '\n' or text[m.start()] == '\r':
  179. # Reset next block start, if has not been named yet
  180. if next_block['name'] == "":
  181. next_block['start'] = m.end() # potential region start
  182. # Block start...
  183. elif text[m.start()] == '{':
  184. # shift indent right
  185. indent_current += 1
  186. # ... if name detected previously
  187. if next_block['name'] != '': # - Start of enclosed block
  188. blocks.append({'name':next_block['name'],
  189. 'start':next_block['start'],
  190. 'cursor':next_block['cursor'],
  191. 'type':next_block['type'],
  192. 'indent_start':indent_current,
  193. 'children':[]})
  194. next_block = reset_next_block(m.end())
  195. curblk += 1
  196. # ... reset next block start, otherwise
  197. else: # - unknown type of block start
  198. next_block['start'] = m.end() # potential region start
  199. # Block end...
  200. elif text[m.start()] == '}':
  201. # ... if indent level matches the start
  202. if blocks[curblk]['indent_start'] == indent_current:
  203. next_block = reset_next_block(m.end())
  204. if curblk == 0:
  205. mpp.cout.notify(data.get_path(),
  206. cursor_current + len(self.regex_ln.findall(text, cursor_last_pos, m.start())),
  207. mpp.cout.SEVERITY_WARNING,
  208. "Non-matching closing bracket '}' detected.")
  209. count_mismatched_brackets += 1
  210. continue
  211. self.finalize_block(text, blocks[curblk], m.end())
  212. assert(blocks[curblk]['type'] != '__global__')
  213. curblk -= 1
  214. assert(curblk >= 0)
  215. blocks[curblk]['children'].append(blocks.pop())
  216. # shift indent left
  217. indent_current -= 1
  218. if indent_current < 0:
  219. mpp.cout.notify(data.get_path(),
  220. cursor_current + len(self.regex_ln.findall(text, cursor_last_pos, m.start())),
  221. mpp.cout.SEVERITY_WARNING,
  222. "Non-matching closing bracket '}' detected.")
  223. count_mismatched_brackets += 1
  224. indent_current = 0
  225. # Potential namespace, struct, class, interface
  226. elif m.group('block_type') != None:
  227. if next_block['name'] == "":
  228. # - 'name'
  229. clearance_pattern = re.compile(r'\s+')
  230. next_block['name'] = clearance_pattern.sub('',m.group('block_name'))
  231. # - 'cursor'
  232. cursor_current += len(self.regex_ln.findall(text, cursor_last_pos, m.start('block_name')))
  233. cursor_last_pos = m.start('block_name')
  234. next_block['cursor'] = cursor_current
  235. # - 'type'
  236. next_block['type'] = m.group('block_type').strip()
  237. # - 'start' detected earlier
  238. # Potential function name detected...
  239. elif m.group('fn_name') != None:
  240. # ... if outside of a function
  241. # (do not detect functions enclosed directly in a function, i.e. without classes)
  242. # ... and other name before has not been matched
  243. if blocks[curblk]['type'] != 'function' and (next_block['name'] == "") \
  244. and next_block['inside_attribute'] == False:
  245. # - 'name'
  246. clearance_pattern = re.compile(r'\s+')
  247. next_block['name'] = clearance_pattern.sub('', m.group('fn_name'))
  248. # - 'cursor'
  249. cursor_current += len(self.regex_ln.findall(text, cursor_last_pos, m.start('fn_name')))
  250. cursor_last_pos = m.start('fn_name')
  251. # NOTE: cursor could be collected together with line_begin, line_end,
  252. # but we keep it here separately for easier debugging of file parsing problems
  253. next_block['cursor'] = cursor_current
  254. # - 'type'
  255. next_block['type'] = 'function'
  256. # - 'start' detected earlier
  257. else:
  258. assert(len("Unknown match by regular expression") == 0)
  259. while indent_current > 0:
  260. # log all
  261. mpp.cout.notify(data.get_path(),
  262. cursor_current + len(self.regex_ln.findall(text, cursor_last_pos, len(text))),
  263. mpp.cout.SEVERITY_WARNING,
  264. "Non-matching opening bracket '{' detected.")
  265. count_mismatched_brackets += 1
  266. indent_current -= 1
  267. for (ind, each) in enumerate(blocks):
  268. each = each # used
  269. block = blocks[len(blocks) - 1 - ind]
  270. self.finalize_block(text, block, len(text))
  271. self.add_lines_data(text, blocks)
  272. self.add_regions(data, blocks)
  273. return count_mismatched_brackets