cpp.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. '''
  2. Created on 26/06/2012
  3. @author: konstaa
  4. '''
  5. import fnmatch
  6. import re
  7. import binascii
  8. import logging
  9. import core.api
  10. class Plugin(core.api.Plugin, core.api.Parent, core.api.Child, core.api.IConfigurable, core.api.ICode):
  11. def declare_configuration(self, parser):
  12. parser.add_option("--std.code.cpp.files", default="*.c,*.cpp,*.h,*.hpp",
  13. help="Enumerates filename extensions to match C/C++ files [default: %default]")
  14. def configure(self, options):
  15. self.files = options.__dict__['std.code.cpp.files'].split(',')
  16. def initialize(self):
  17. core.api.subscribe_by_parents_name('core.dir', self)
  18. namespace = self.get_plugin_loader().get_database_loader().create_namespace(self.get_name())
  19. namespace.add_field('files', int)
  20. namespace.add_field('mismatched_brackets', None)
  21. def callback(self, parent, data):
  22. for ext in self.files:
  23. if fnmatch.fnmatch(data.get_path(), ext):
  24. if data.get_data(self.get_name(), 'files') != None:
  25. self.notify_children(data)
  26. return
  27. count_mismatched_brackets = CppCodeParser().run(data)
  28. data.set_data(self.get_name(), 'files', 1)
  29. data.set_data(self.get_name(), 'mismatched_brackets', count_mismatched_brackets)
  30. self.notify_children(data)
  31. break
  32. class CppCodeParser(object):
  33. regex_cpp = re.compile(r'''
  34. /([\\](?:\n|\r|\r\n))*/(?=\n|\r|\r\n) # Match C++ style comments (empty comment line)
  35. | /([\\](?:\n|\r|\r\n))*/.*?[^\\](?=\n|\r|\r\n) # Match C++ style comments
  36. # NOTE: end of line is NOT consumed
  37. # NOTE: ([\\](?:\n|\r|\r\n))* for new line separators,
  38. # Need to support new line separators in expense of efficiency?
  39. | /([\\](?:\n|\r|\r\n))*\*.*?\*([\\](?:\n|\r|\r\n))*/ # Match C style comments
  40. | \'(?:\\.|[^\\\'])*\' # Match quoted strings
  41. | "(?:\\.|[^\\"])*" # Match double quoted strings
  42. | (((?<=\n|\r)|^)[ \t]*[#].*?[^\\](?=\n|\r|\r\n)) # Match preprocessor
  43. # NOTE: end of line is NOT consumed
  44. # NOTE: beginning of line is NOT consumed
  45. | (?P<fn_name>
  46. (operator( # Match C++ operator ...
  47. (\s+[_a-zA-Z][_a-zA-Z0-9]*(\s*\[\s*\])?) # - cast, new and delete operators
  48. | (\s*\[\s*\]) # - operator []
  49. | (\s*\(\s*\)) # - operator ()
  50. | (\s*[+-\\*/=<>!%&^|~,?.]{1,3}) # - other operators (from 1 to 3 symbols)
  51. ))
  52. | ([~]?[_a-zA-Z][_a-zA-Z0-9]*) # ... or function or constructor
  53. )\s*[(] # LIMITATION: if there are comments after function name
  54. # and before '(', it is not detected
  55. # LIMITATION: if there are comments within operator definition,
  56. # if may be not detected
  57. | ((?P<block_type>class|struct|namespace) # Match C++ class or struct
  58. (?P<block_name>((\s+[a-zA-Z_][a-zA-Z0-9_]*)|(?=\s*[{])))) # noname is supported, symbol '{' is not consumed
  59. # LIMITATION: if there are comments between keyword and name,
  60. # it is not detected
  61. | [<>{};:] # Match block start/end, brackets and statement separator
  62. | ((?:\n|\r|\r\n)\s*(?:\n|\r|\r\n)) # Match double empty line
  63. ''',
  64. re.DOTALL | re.MULTILINE | re.VERBOSE
  65. )
  66. regex_ln = re.compile(r'(\n)|(\r)|(\r\n)')
  67. def run(self, data):
  68. self.__init__() # Go to initial state if it is called twice
  69. return self.parse(data)
  70. def finalize_block(self, text, block, block_end):
  71. space_match = re.match('^\s*', text[block['start']:block_end], re.MULTILINE)
  72. block['start'] += space_match.end() # trim spaces at the beginning
  73. block['end'] = block_end
  74. start_pos = block['start']
  75. crc32 = 0
  76. for child in block['children']:
  77. # exclude children
  78. crc32 = binascii.crc32(text[start_pos:child['start']], crc32)
  79. start_pos = child['end']
  80. block['checksum'] = binascii.crc32(text[start_pos:block['end']], crc32) & 0xffffffff # to match python 3
  81. def add_lines_data(self, text, blocks):
  82. def add_lines_data_rec(self, text, blocks):
  83. for each in blocks:
  84. # add line begin
  85. self.total_current += len(self.regex_ln.findall(text, self.total_last_pos, each['start']))
  86. each['line_begin'] = self.total_current
  87. self.total_last_pos = each['start']
  88. # process enclosed
  89. add_lines_data_rec(self, text, each['children'])
  90. # add line end
  91. self.total_current += len(self.regex_ln.findall(text, self.total_last_pos, each['end']))
  92. each['line_end'] = self.total_current
  93. self.total_last_pos = each['end']
  94. self.total_last_pos = 0
  95. self.total_current = 1
  96. add_lines_data_rec(self, text, blocks)
  97. def add_regions(self, data, blocks):
  98. # Note: data.add_region() internals depend on special ordering of regions
  99. # in order to identify enclosed regions efficiently
  100. def add_regions_rec(self, data, blocks):
  101. def get_type_id(data, named_type):
  102. if named_type == "function":
  103. return data.get_region_types().FUNCTION
  104. elif named_type == "class":
  105. return data.get_region_types().CLASS
  106. elif named_type == "struct":
  107. return data.get_region_types().STRUCT
  108. elif named_type == "namespace":
  109. return data.get_region_types().NAMESPACE
  110. elif named_type == "__global__":
  111. return data.get_region_types().GLOBAL
  112. else:
  113. assert(False)
  114. for each in blocks:
  115. data.add_region(each['name'], each['start'], each['end'],
  116. each['line_begin'], each['line_end'], each['cursor'],
  117. get_type_id(data, each['type']), each['checksum'])
  118. add_regions_rec(self, data, each['children'])
  119. add_regions_rec(self, data, blocks)
  120. def parse(self, data):
  121. def reset_next_block(start):
  122. return {'name':'', 'start':start, 'cursor':0, 'type':'', 'confirmed':False}
  123. count_mismatched_brackets = 0
  124. text = data.get_content()
  125. indent_current = 0;
  126. blocks = [{'name':'__global__', 'start':0, 'cursor':0, 'type':'__global__', 'indent_start':indent_current, 'children':[]}]
  127. curblk = 0
  128. next_block = reset_next_block(0)
  129. cursor_last_pos = 0
  130. cursor_current = 1
  131. for m in re.finditer(self.regex_cpp, text):
  132. # Comment
  133. if text[m.start()] == '/':
  134. data.add_marker(m.start(), m.end(), data.get_marker_types().COMMENT)
  135. if text[m.start():m.end()].startswith("//\n"):
  136. print text[m.start():m.end()]
  137. # String
  138. elif text[m.start()] == '"' or text[m.start()] == '\'':
  139. data.add_marker(m.start() + 1, m.end() - 1, data.get_marker_types().STRING)
  140. # Preprocessor (including internal comments)
  141. elif text[m.start()] == ' ' or text[m.start()] == '\t' or text[m.start()] == '#':
  142. data.add_marker(m.start(), m.end(), data.get_marker_types().PREPROCESSOR)
  143. # Statement end
  144. elif text[m.start()] == ';':
  145. # Reset next block name and start
  146. next_block['name'] = ""
  147. next_block['start'] = m.end() # potential region start
  148. # Template argument closing bracket
  149. elif text[m.start()] == '>':
  150. # Reset next block name and start (in order to skip class names in templates), if has not been confirmed before
  151. if next_block['confirmed'] == False and (next_block['type'] == 'class' or next_block['type'] == 'struct'):
  152. next_block['name'] = ""
  153. next_block['start'] = m.end() # potential region start
  154. # Template argument opening bracket or after class inheritance specification
  155. elif text[m.start()] == ':' or text[m.start()] == '<':
  156. # .. if goes after calss definition
  157. if next_block['type'] == 'class' or next_block['type'] == 'struct':
  158. next_block['confirmed'] = True
  159. # Double end line
  160. elif text[m.start()] == '\n' or text[m.start()] == '\r':
  161. # Reset next block start, if has not been named yet
  162. if next_block['name'] == "":
  163. next_block['start'] = m.end() # potential region start
  164. # Block start...
  165. elif text[m.start()] == '{':
  166. # shift indent right
  167. indent_current += 1
  168. # ... if name detected previously
  169. if next_block['name'] != '': # - Start of enclosed block
  170. blocks.append({'name':next_block['name'],
  171. 'start':next_block['start'],
  172. 'cursor':next_block['cursor'],
  173. 'type':next_block['type'],
  174. 'indent_start':indent_current,
  175. 'children':[]})
  176. next_block = reset_next_block(m.end())
  177. curblk += 1
  178. # ... reset next block start, otherwise
  179. else: # - unknown type of block start
  180. next_block['start'] = m.end() # potential region start
  181. # Block end...
  182. elif text[m.start()] == '}':
  183. # ... if indent level matches the start
  184. if blocks[curblk]['indent_start'] == indent_current:
  185. next_block = reset_next_block(m.end())
  186. if curblk == 0:
  187. print data.get_path()
  188. print cursor_current + len(self.regex_ln.findall(text, cursor_last_pos, m.start()))
  189. logging.warning("Non-matching closing bracket '}' detected")
  190. count_mismatched_brackets += 1
  191. continue
  192. self.finalize_block(text, blocks[curblk], m.end())
  193. assert(blocks[curblk]['type'] != '__global__')
  194. curblk -= 1
  195. assert(curblk >= 0)
  196. blocks[curblk]['children'].append(blocks.pop())
  197. # shift indent left
  198. indent_current -= 1
  199. if indent_current < 0:
  200. logging.warning("Non-matching closing bracket '}' detected")
  201. count_mismatched_brackets += 1
  202. indent_current = 0
  203. # Potential namespace, struct, class
  204. elif text[m.start():m.end()].startswith(('class','struct','namespace')) == True \
  205. and m.group('fn_name') == None: # function name can start with keyword, for example class_id_type()
  206. if next_block['name'] == "":
  207. # - 'name'
  208. next_block['name'] = m.group('block_name').strip()
  209. if next_block['name'] == "":
  210. next_block['name'] = '__noname__'
  211. # - 'cursor'
  212. cursor_current += len(self.regex_ln.findall(text, cursor_last_pos, m.start('block_name')))
  213. cursor_last_pos = m.start('block_name')
  214. next_block['cursor'] = cursor_current
  215. # - 'type'
  216. next_block['type'] = m.group('block_type').strip()
  217. # - 'start' detected earlier
  218. # Potential function name detected...
  219. else:
  220. # ... if outside of a function (do not detect enclosed functions, unless classes are matched)
  221. if blocks[curblk]['type'] != 'function' and (next_block['name'] == "" or next_block['type'] != 'function'):
  222. # - 'name'
  223. next_block['name'] = m.group('fn_name').strip()
  224. # - 'cursor'
  225. cursor_current += len(self.regex_ln.findall(text, cursor_last_pos, m.start('fn_name')))
  226. cursor_last_pos = m.start('fn_name')
  227. # NOTE: cursor could be collected together with line_begin, line_end,
  228. # but we keep it here separately for easier debugging of file parsing problems
  229. next_block['cursor'] = cursor_current
  230. # - 'type'
  231. next_block['type'] = 'function'
  232. # - 'start' detected earlier
  233. while indent_current > 0:
  234. # log all
  235. logging.warning("Non-matching opening bracket '{' detected")
  236. count_mismatched_brackets += 1
  237. indent_current -= 1
  238. for (ind, each) in enumerate(blocks):
  239. each = each # used
  240. block = blocks[len(blocks) - 1 - ind]
  241. self.finalize_block(text, block, len(text))
  242. self.add_lines_data(text, blocks)
  243. self.add_regions(data, blocks)
  244. return count_mismatched_brackets