cs.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319
  1. #
  2. # Metrix++, Copyright 2009-2013, Metrix++ Project
  3. # Link: http://metrixplusplus.sourceforge.net
  4. #
  5. # This file is a part of Metrix++ Tool.
  6. #
  7. # Metrix++ is free software: you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation, version 3 of the License.
  10. #
  11. # Metrix++ is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with Metrix++. If not, see <http://www.gnu.org/licenses/>.
  18. #
  19. import re
  20. import binascii
  21. import logging
  22. import core.api
  23. class Plugin(core.api.Plugin, core.api.Parent, core.api.IParser, core.api.IConfigurable, core.api.ICode):
  24. def declare_configuration(self, parser):
  25. parser.add_option("--std.code.cs.files", default="*.cs",
  26. help="Enumerates filename extensions to match C# files [default: %default]")
  27. def configure(self, options):
  28. self.files = options.__dict__['std.code.cs.files'].split(',')
  29. self.files.sort() # sorted list goes to properties
  30. def initialize(self):
  31. # trigger version property set
  32. core.api.Plugin.initialize(self)
  33. db_loader = self.get_plugin_loader().get_database_loader()
  34. prev_ext = db_loader.set_property(self.get_name() + ":files", ','.join(self.files))
  35. if prev_ext != ','.join(self.files):
  36. self.is_updated = True
  37. self.get_plugin_loader().register_parser(self.files, self)
  38. def process(self, parent, data, is_updated):
  39. is_updated = is_updated or self.is_updated
  40. count_mismatched_brackets = 0
  41. if is_updated == True:
  42. count_mismatched_brackets = CsCodeParser().run(data)
  43. self.notify_children(data, is_updated)
  44. return count_mismatched_brackets
  45. class CsCodeParser(object):
  46. regex_cpp = re.compile(r'''
  47. //(?=\n|\r|\r\n) # Match C# style comments (empty comment line)
  48. | //.*?(?=\n|\r|\r\n) # Match C# style comments
  49. # NOTE: end of line is NOT consumed
  50. # NOTE: it is slightly different in C++
  51. | /\*\*/ # Match C style comments (empty comment line)
  52. # NOTE: it is slightly different in C++
  53. | /\*.*?\*/ # Match C style comments
  54. # NOTE: it is slightly different in C++
  55. | \'(?:\\.|[^\\\'])*\' # Match quoted strings
  56. | "(?:\\.|[^\\"])*" # Match double quoted strings
  57. | (((?<=\n|\r)|^)[ \t]*[#].*?(?=\n|\r|\r\n)) # Match preprocessor
  58. # NOTE: end of line is NOT consumed
  59. # NOTE: beginning of line is NOT consumed
  60. # NOTE: C# does not support backslashing as C++ does
  61. | (?P<fn_name>
  62. (operator( # Match C# operator ...
  63. (\s+[_a-zA-Z][_a-zA-Z0-9]*(\s*\[\s*\])?) # - cast, true, false operators
  64. | (\s*\[\s*\]) # - operator []
  65. | (\s*\(\s*\)) # - operator ()
  66. | (\s*[+-\\*/=<>!%&^|~,?.]{1,3}) # - other operators (from 1 to 3 symbols)
  67. # NOTE: maybe dot and ? should not be in the list...
  68. ))
  69. | (([~]\s*)?[_a-zA-Z][_a-zA-Z0-9]*
  70. ([.][a-zA-Z_][a-zA-Z0-9_]*)*) # ... or function or constructor
  71. # NOTE: C# destructor can have spaces in name after ~
  72. # NOTE: explicit interface implementation method has got a dot
  73. | (?P<prop_setget>get|set) # ... or property setter/getter
  74. )\s*(?(prop_setget)(?=[{])|[(<])
  75. # LIMITATION: if there are comments after function name
  76. # and before '(', it is not detected
  77. # LIMITATION: if there are comments within operator definition,
  78. # if may be not detected
  79. # LIMITATION: if there are comments after set|get keyword,
  80. # if may be not detected
  81. | ((?P<block_type>class|struct|namespace|interface) # Match class or struct or interface or namespace
  82. (?P<block_name>(\s+[a-zA-Z_][a-zA-Z0-9_]*)([.][a-zA-Z_][a-zA-Z0-9_]*)*))
  83. # NOTE: noname instances are impossible in C#
  84. # NOTE: names can have sub-names separated by dots
  85. # LIMITATION: if there are comments between keyword and name,
  86. # it is not detected
  87. | [\[\]{};] # Match block start/end and statement separator
  88. # NOTE: C++ parser includes processing of <> and :
  89. # to handle template definitions, it is easier in C#
  90. | ((?:\n|\r|\r\n)\s*(?:\n|\r|\r\n)) # Match double empty line
  91. ''',
  92. re.DOTALL | re.MULTILINE | re.VERBOSE
  93. )
  94. regex_ln = re.compile(r'(\n)|(\r)|(\r\n)')
  95. def run(self, data):
  96. self.__init__() # Go to initial state if it is called twice
  97. return self.parse(data)
  98. def finalize_block(self, text, block, block_end):
  99. space_match = re.match('^\s*', text[block['start']:block_end], re.MULTILINE)
  100. block['start'] += space_match.end() # trim spaces at the beginning
  101. block['end'] = block_end
  102. start_pos = block['start']
  103. crc32 = 0
  104. for child in block['children']:
  105. # exclude children
  106. crc32 = binascii.crc32(text[start_pos:child['start']], crc32)
  107. start_pos = child['end']
  108. block['checksum'] = binascii.crc32(text[start_pos:block['end']], crc32) & 0xffffffff # to match python 3
  109. def add_lines_data(self, text, blocks):
  110. def add_lines_data_rec(self, text, blocks):
  111. for each in blocks:
  112. # add line begin
  113. self.total_current += len(self.regex_ln.findall(text, self.total_last_pos, each['start']))
  114. each['line_begin'] = self.total_current
  115. self.total_last_pos = each['start']
  116. # process enclosed
  117. add_lines_data_rec(self, text, each['children'])
  118. # add line end
  119. self.total_current += len(self.regex_ln.findall(text, self.total_last_pos, each['end']))
  120. each['line_end'] = self.total_current
  121. self.total_last_pos = each['end']
  122. self.total_last_pos = 0
  123. self.total_current = 1
  124. add_lines_data_rec(self, text, blocks)
  125. def add_regions(self, data, blocks):
  126. # Note: data.add_region() internals depend on special ordering of regions
  127. # in order to identify enclosed regions efficiently
  128. def add_regions_rec(self, data, blocks):
  129. def get_type_id(data, named_type):
  130. if named_type == "function":
  131. return data.get_region_types().FUNCTION
  132. elif named_type == "class":
  133. return data.get_region_types().CLASS
  134. elif named_type == "struct":
  135. return data.get_region_types().STRUCT
  136. elif named_type == "namespace":
  137. return data.get_region_types().NAMESPACE
  138. elif named_type == "interface":
  139. return data.get_region_types().INTERFACE
  140. elif named_type == "__global__":
  141. return data.get_region_types().GLOBAL
  142. else:
  143. assert(False)
  144. for each in blocks:
  145. data.add_region(each['name'], each['start'], each['end'],
  146. each['line_begin'], each['line_end'], each['cursor'],
  147. get_type_id(data, each['type']), each['checksum'])
  148. add_regions_rec(self, data, each['children'])
  149. add_regions_rec(self, data, blocks)
  150. def parse(self, data):
  151. def reset_next_block(start):
  152. return {'name':'', 'start':start, 'cursor':0, 'type':'', 'inside_attribute':False}
  153. count_mismatched_brackets = 0
  154. text = data.get_content()
  155. indent_current = 0;
  156. blocks = [{'name':'__global__', 'start':0, 'cursor':0, 'type':'__global__', 'indent_start':indent_current, 'children':[]}]
  157. curblk = 0
  158. next_block = reset_next_block(0)
  159. cursor_last_pos = 0
  160. cursor_current = 1
  161. for m in re.finditer(self.regex_cpp, text):
  162. # Comment
  163. if text[m.start()] == '/':
  164. data.add_marker(m.start(), m.end(), data.get_marker_types().COMMENT)
  165. if text[m.start():m.end()].startswith("//\n"):
  166. print text[m.start():m.end()]
  167. # String
  168. elif text[m.start()] == '"' or text[m.start()] == '\'':
  169. data.add_marker(m.start() + 1, m.end() - 1, data.get_marker_types().STRING)
  170. # Preprocessor (including internal comments)
  171. elif text[m.start()] == ' ' or text[m.start()] == '\t' or text[m.start()] == '#':
  172. data.add_marker(m.start(), m.end(), data.get_marker_types().PREPROCESSOR)
  173. # Statement end
  174. elif text[m.start()] == ';':
  175. # Reset next block name and start
  176. next_block['name'] = ""
  177. next_block['start'] = m.end() # potential region start
  178. # Block openned by '[' bracket...
  179. elif text[m.start()] == '[':
  180. # ... may include attributes, so do not capture function names inside
  181. next_block['inside_attribute'] = True
  182. # Block closed by ']' bracket...
  183. # note: do not care about nesting for simplicity -
  184. # because attribute's statement can not have symbol ']' inside
  185. elif text[m.start()] == ']':
  186. # ... may include attributes, so do not capture function names inside
  187. next_block['inside_attribute'] = False
  188. # Double end line
  189. elif text[m.start()] == '\n' or text[m.start()] == '\r':
  190. # Reset next block start, if has not been named yet
  191. if next_block['name'] == "":
  192. next_block['start'] = m.end() # potential region start
  193. # Block start...
  194. elif text[m.start()] == '{':
  195. # shift indent right
  196. indent_current += 1
  197. # ... if name detected previously
  198. if next_block['name'] != '': # - Start of enclosed block
  199. blocks.append({'name':next_block['name'],
  200. 'start':next_block['start'],
  201. 'cursor':next_block['cursor'],
  202. 'type':next_block['type'],
  203. 'indent_start':indent_current,
  204. 'children':[]})
  205. next_block = reset_next_block(m.end())
  206. curblk += 1
  207. # ... reset next block start, otherwise
  208. else: # - unknown type of block start
  209. next_block['start'] = m.end() # potential region start
  210. # Block end...
  211. elif text[m.start()] == '}':
  212. # ... if indent level matches the start
  213. if blocks[curblk]['indent_start'] == indent_current:
  214. next_block = reset_next_block(m.end())
  215. if curblk == 0:
  216. logging.warning("Non-matching closing bracket '}' detected: " + data.get_path() + ":" +
  217. str(cursor_current + len(self.regex_ln.findall(text, cursor_last_pos, m.start()))))
  218. count_mismatched_brackets += 1
  219. continue
  220. self.finalize_block(text, blocks[curblk], m.end())
  221. assert(blocks[curblk]['type'] != '__global__')
  222. curblk -= 1
  223. assert(curblk >= 0)
  224. blocks[curblk]['children'].append(blocks.pop())
  225. # shift indent left
  226. indent_current -= 1
  227. if indent_current < 0:
  228. logging.warning("Non-matching closing bracket '}' detected")
  229. count_mismatched_brackets += 1
  230. indent_current = 0
  231. # Potential namespace, struct, class, interface
  232. elif m.group('block_type') != None:
  233. if next_block['name'] == "":
  234. # - 'name'
  235. next_block['name'] = m.group('block_name').strip()
  236. # - 'cursor'
  237. cursor_current += len(self.regex_ln.findall(text, cursor_last_pos, m.start('block_name')))
  238. cursor_last_pos = m.start('block_name')
  239. next_block['cursor'] = cursor_current
  240. # - 'type'
  241. next_block['type'] = m.group('block_type').strip()
  242. # - 'start' detected earlier
  243. # Potential function name detected...
  244. elif m.group('fn_name') != None:
  245. # ... if outside of a function
  246. # (do not detect functions enclosed directly in a function, i.e. without classes)
  247. # ... and other name before has not been matched
  248. if blocks[curblk]['type'] != 'function' and (next_block['name'] == "") \
  249. and next_block['inside_attribute'] == False:
  250. # - 'name'
  251. next_block['name'] = m.group('fn_name').strip()
  252. # - 'cursor'
  253. cursor_current += len(self.regex_ln.findall(text, cursor_last_pos, m.start('fn_name')))
  254. cursor_last_pos = m.start('fn_name')
  255. # NOTE: cursor could be collected together with line_begin, line_end,
  256. # but we keep it here separately for easier debugging of file parsing problems
  257. next_block['cursor'] = cursor_current
  258. # - 'type'
  259. next_block['type'] = 'function'
  260. # - 'start' detected earlier
  261. else:
  262. assert(len("Unknown match by regular expression") == 0)
  263. while indent_current > 0:
  264. # log all
  265. logging.warning("Non-matching opening bracket '{' detected")
  266. count_mismatched_brackets += 1
  267. indent_current -= 1
  268. for (ind, each) in enumerate(blocks):
  269. each = each # used
  270. block = blocks[len(blocks) - 1 - ind]
  271. self.finalize_block(text, block, len(text))
  272. self.add_lines_data(text, blocks)
  273. self.add_regions(data, blocks)
  274. return count_mismatched_brackets