#
#    Metrix++, Copyright 2009-2019, Metrix++ Project
#    Link: https://github.com/metrixplusplus/metrixplusplus
#
#    This file is a part of Metrix++ Tool.
#

from metrixpp.mpp import api

import re
import os
import sys
import logging
import time
import binascii
import fnmatch
import multiprocessing.pool

class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):

    def __init__(self):
        self.reader = DirectoryReader()
        self.include_rules = []
        self.exclude_rules = []
        self.exclude_dir_rules = []
        self.exclude_files = []
        self.parsers = []
        super(Plugin, self).__init__()

    def declare_configuration(self, parser):
        parser.add_option("--std.general.proctime", "--sgpt", action="store_true", default=False,
                          help="If the option is set (True), the tool measures processing time per file [default: %default]")
        parser.add_option("--std.general.procerrors", "--sgpe", action="store_true", default=False,
                          help="If the option is set (True), the tool counts the number of processing/parsing errors per file [default: %default]")
        parser.add_option("--std.general.size", "--sgs", action="store_true", default=False,
                          help="If the option is set (True), the tool collects the file size metric (in bytes) [default: %default]")
        parser.add_option("--include-files", "--if", action='append',
                          help="Adds a regular expression pattern to include files in processing (files have to match any rule to be included)")
        parser.add_option("--exclude-files", "--ef", action='append',
                          help="Adds a regular expression pattern to exclude files or directories by name from processing")
        parser.add_option("--exclude-directories", "--ed", action='append',
                          help="Adds a regular expression pattern to exclude directories by path from processing")
        parser.add_option("--non-recursively", "--nr", action="store_true", default=False,
                          help="If the option is set (True), sub-directories are not processed [default: %default]")
        self.optparser = parser
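
    # Example invocation exercising these options (assuming the standard
    # "metrix++ collect" front-end; the path and pattern are illustrative only):
    #   metrix++ collect --std.general.size --exclude-files='.*\.min\.js' ./src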

    def configure(self, options):
        self.is_proctime_enabled = options.__dict__['std.general.proctime']
        self.is_procerrors_enabled = options.__dict__['std.general.procerrors']
        self.is_size_enabled = options.__dict__['std.general.size']
        # check if any include rule is given
        if options.__dict__['include_files']:
            try:
                for include_rule in options.__dict__['include_files']:
                    self.add_include_rule(re.compile(include_rule))
            except Exception as e:
                self.optparser.error("option --include-files: " + str(e))
        else:
            self.add_include_rule(re.compile(r'.*'))
        # check if any exclude rule is given
        if options.__dict__['exclude_files']:
            try:
                for exclude_rule in options.__dict__['exclude_files']:
                    self.add_exclude_rule(re.compile(exclude_rule))
            except Exception as e:
                self.optparser.error("option --exclude-files: " + str(e))
        else:
            self.add_exclude_rule(re.compile(r'^[.]'))
        # check if any exclude dir rule is given
        if options.__dict__['exclude_directories']:
            try:
                for exclude_dir_rule in options.__dict__['exclude_directories']:
                    self.add_exclude_dir_rule(re.compile(exclude_dir_rule))
            except Exception as e:
                self.optparser.error("option --exclude-directories: " + str(e))
        self.non_recursively = options.__dict__['non_recursively']

    def initialize(self):
        fields = []
        if self.is_proctime_enabled:
            fields.append(self.Field('proctime', float))
        if self.is_procerrors_enabled:
            fields.append(self.Field('procerrors', int))
        if self.is_size_enabled:
            fields.append(self.Field('size', int))
        super(Plugin, self).initialize(namespace='std.general', support_regions=False, fields=fields)
        self.add_exclude_file(self.get_plugin('metrixpp.mpp.dbf').get_dbfile_path())
        self.add_exclude_file(self.get_plugin('metrixpp.mpp.dbf').get_dbfile_prev_path())
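
    # The two add_exclude_file() calls above keep the tool's own database
    # files (the current and the previous snapshot) out of the measured file
    # set; the fields declared here are filled per file in DirectoryReader.run().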

    def run(self, args):
        if len(args) == 0:
            return self.reader.run(self, "./")
        retcode = 0
        for directory in args:
            retcode += self.reader.run(self, directory)
        return retcode
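
    # Note: DirectoryReader.run() (bottom of this file) returns non-zero only
    # when a given path does not exist; per-file parsing problems are reported
    # through logging and the optional 'procerrors' field instead.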

    def register_parser(self, fnmatch_exp_list, parser):
        self.parsers.append((fnmatch_exp_list, parser))

    def get_parser(self, file_path):
        for fnmatch_exp_list, parser in self.parsers:
            for fnmatch_exp in fnmatch_exp_list:
                if fnmatch.fnmatch(file_path, fnmatch_exp):
                    return parser
        return None
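
    # Parser plugins call register_parser() during initialization; the first
    # registered pattern that matches wins, in registration order. A
    # hypothetical C parser, for instance, might register itself like this:
    #   plugin.register_parser(['*.c', '*.h'], self)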

    def add_include_rule(self, re_compiled_pattern):
        self.include_rules.append(re_compiled_pattern)

    def add_exclude_rule(self, re_compiled_pattern):
        self.exclude_rules.append(re_compiled_pattern)

    def add_exclude_dir_rule(self, re_compiled_pattern):
        self.exclude_dir_rules.append(re_compiled_pattern)

    def add_exclude_file(self, file_path):
        if file_path is None:
            return
        self.exclude_files.append(file_path)
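
    # Filtering precedence applied by is_file_excluded() below:
    #   1. a file must match at least one --include-files rule (whitelist);
    #   2. a directory is dropped if its path matches an --exclude-directories rule;
    #   3. a file or directory is dropped if its basename matches an --exclude-files rule;
    #   4. files registered via add_exclude_file() (the tool's databases) are always skipped.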

    def is_file_excluded(self, file_name):
        # only apply the include rules to files - skip directories
        if os.path.isfile(file_name):
            for each in self.include_rules:
                if re.match(each, os.path.basename(file_name)) is not None:
                    break
            else:
                # the file is excluded if no include rule matched
                logging.info("Excluding: " + file_name + " - not included by any rule")
                return True
        # check the exclude dir rules for directories
        if os.path.isdir(file_name):
            for each in self.exclude_dir_rules:
                if re.match(each, file_name) is not None:
                    logging.info("Excluding: " + file_name + " - excluded by rule '" + each.pattern + "'")
                    return True
        # check the exclude rules for both files and directories
        for each in self.exclude_rules:
            if re.match(each, os.path.basename(file_name)) is not None:
                logging.info("Excluding: " + file_name + " - excluded by rule '" + each.pattern + "'")
                return True
        # finally, check if the file is excluded directly
        for each in self.exclude_files:
            if os.path.basename(each) == os.path.basename(file_name):
                if os.stat(each) == os.stat(file_name):
                    return True
        return False

class DirectoryReader():

    def readtextfile(self, filename):
        """ Read a text file and try to detect its encoding

        Since we examine program code text files we can assume the following:
        - There are no NUL characters, i.e. no 0x00 sequences of 1, 2 or 4
          bytes, starting on 1, 2 or 4 byte boundaries (depending on a
          1, 2 or 4 byte coding)
        - There should be at least one space (ASCII 0x20) char
          of the respective length (1, 2 or 4 bytes)
        - Program code consists of ASCII chars only, i.e. codes < 128
        - Non-ASCII chars should appear in string literals and comments only

        Especially in the case of an 8 bit coding it does not matter which
        code page is assumed: metric analysis is done on the program code,
        which is pure ASCII; string literals and comments are only recognized
        as such but not interpreted, so it does not matter if they contain
        non-ASCII chars, whichever code page is used.

        Note the decoder's different behavior for the "utf_nn" identifiers:
        - .decode("utf_32") / .decode("utf_16"): a preceding BOM is skipped
        - with suffix "_be" or "_le" respectively: a preceding BOM is preserved
        but:
        - .decode("utf_8"): a preceding BOM is preserved
        - .decode("utf_8_sig"): a preceding BOM is skipped
        """

        # Methods to check for various UTF variants without a BOM:
        # Since UTF16/32 encodings are recommended to use a BOM, these methods
        # shouldn't be necessary, but they may be useful in certain cases.
        def checkforUTF32_BE(a):
            if (len(a) % 4) != 0: return False
            n = a.find(b'\x00\x00\x00\x20')
            return (n >= 0) and ((n % 4) == 0)
        def checkforUTF32_LE(a):
            if (len(a) % 4) != 0: return False
            n = a.find(b'\x20\x00\x00\x00')
            return (n >= 0) and ((n % 4) == 0)
        def checkforUTF16_BE(a):
            if (len(a) % 2) != 0: return False
            n = a.find(b'\x00\x20')
            return (n >= 0) and ((n % 2) == 0)
        def checkforUTF16_LE(a):
            if (len(a) % 2) != 0: return False
            n = a.find(b'\x20\x00')
            return (n >= 0) and ((n % 2) == 0)
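
        # Rationale for these checks: a space (U+0020) encoded in UTF-32-BE is
        # the 4-byte sequence 00 00 00 20 aligned on a 4-byte boundary, e.g.
        # "a b" -> 00 00 00 61  00 00 00 20  00 00 00 62; finding such an
        # aligned sequence strongly suggests that width and byte order.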

        # Method to check for UTF8 without a BOM:
        # "a" is the text file represented as a simple byte array!
        # Find the first char with code > 127:
        #
        # 1 nothing found: all bytes 0..127; in this case "a" only consists
        #   of ASCII chars, but this may also be treated as valid UTF8 coding
        #
        # 2 the code is a valid UTF8 leading byte: 192..243 (as checked below)
        #   then check the subsequent bytes to be UTF8 extension bytes: 128..191
        #   Some additional plausibility checks are done as well:
        #   if a valid UTF8 byte sequence is found,
        #   - the byte following the UTF8 sequence must be an ASCII char
        #   - or another UTF8 leading byte (in the latter case we assume that
        #     the appropriate number of UTF8 extension bytes follows...)
        #   Note that these checks don't guarantee the text is really UTF8 encoded:
        #   if a valid UTF8 sequence is found but the text is in fact some sort
        #   of 8 bit OEM coding, this may coincidentally be a sequence of 8 bit
        #   OEM chars. This indeed seems very unlikely but may happen...
        #   Even if the whole text were examined for UTF8 sequences: every
        #   valid UTF8 sequence found may also be a sequence of OEM chars!
        #
        # 3 the code is not a valid UTF8 leading byte: 128..191 or 244..255
        #   In this case the coding is some sort of 8 bit OEM coding. Since we
        #   don't know which OEM code page the file was written with, we assume
        #   "latin_1" (mostly the same as ANSI, but "ansi" isn't available on
        #   Python 2)
        #
        # returns the suggested text coding: "ascii", "utf_8" or "latin_1" (resp. the default)
        def checkforUTF8(a, default="latin_1"):
            # Since "a" is a string array on Python 2 we use a special ORD function:
            # convert c to its byte representation if it is a character.
            # Works for Python 2+3.
            def ORD(c): return ord(c) if (type(c) == str) else c
            L = len(a)
            n = 0
            while (n < L) and (ORD(a[n]) < 128):  # (a[n] < ExtASCII)
                n = n + 1
            if n >= L:           # all chars < 128: ASCII coding
                return "ascii"   # but may also be treated as UTF8!
            w = a[n]
            # UTF8 two byte sequence: leading byte + 1 extension byte
            if ORD(w) in range(192, 224):
                if ((n+1 < L)
                    and (ORD(a[n+1]) in range(128, 192))     # valid UTF8 extension byte
                   ):
                    if ((n+2 == L)                           # w is last character
                        or (ORD(a[n+2]) < 128)               # or next byte is an ASCII char
                        or (ORD(a[n+2]) in range(192, 244))  # or next byte is an UTF8 leading byte
                       ):
                        return "utf_8"
                return default
            # UTF8 three byte sequence: leading byte + 2 extension bytes
            if ORD(w) in range(224, 240):
                if ((n+2 < L)
                    and (ORD(a[n+1]) in range(128, 192))     # 2 valid UTF8 extension bytes
                    and (ORD(a[n+2]) in range(128, 192))
                   ):
                    if ((n+3 == L)                           # w is last character
                        or (ORD(a[n+3]) < 128)               # or next byte is an ASCII char
                        or (ORD(a[n+3]) in range(192, 244))  # or next byte is an UTF8 leading byte
                       ):
                        return "utf_8"
                return default
            # UTF8 four byte sequence: leading byte + 3 extension bytes
            if ORD(w) in range(240, 244):
                if ((n+3 < L)
                    and (ORD(a[n+1]) in range(128, 192))     # 3 valid UTF8 extension bytes
                    and (ORD(a[n+2]) in range(128, 192))
                    and (ORD(a[n+3]) in range(128, 192))
                   ):
                    if ((n+4 == L)                           # w is last character
                        or (ORD(a[n+4]) < 128)               # or next byte is an ASCII char
                        or (ORD(a[n+4]) in range(192, 244))  # or next byte is an UTF8 leading byte
                       ):
                        return "utf_8"
                return default
            # no valid UTF8 byte sequence:
            return default
        # end of checkforUTF8 ------------------------------------------------
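
        # Illustration (hypothetical inputs, shown for clarity only):
        #   checkforUTF8(b'abc')          -> "ascii"    (no byte > 127)
        #   checkforUTF8(b'caf\xc3\xa9')  -> "utf_8"    (C3 A9 is a valid 2-byte sequence)
        #   checkforUTF8(b'caf\xe9')      -> "latin_1"  (E9 alone is not valid UTF8)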

        # ----------------------------------------------------------------------
        # Subroutine readtextfile
        # Open the file as binary and try to guess the encoding
        # Attention:
        # - Python 3: "a" is a binary array
        # - Python 2: "a" is a string array!
        # ----------------------------------------------------------------------
        f = open(filename, 'rb')
        a = f.read()
        f.close()

        # check for codings with a BOM:
        # Consider the order: check for UTF32 first!
        if (a.startswith(b'\xff\xfe\x00\x00')
                or a.startswith(b'\x00\x00\xfe\xff')):
            coding = "utf_32"    # no suffix _be/_le --> the decoder skips the BOM
        elif (a.startswith(b'\xff\xfe')
                or a.startswith(b'\xfe\xff')):
            coding = "utf_16"    # no suffix _be/_le --> the decoder skips the BOM
        elif a.startswith(b'\xef\xbb\xbf'):
            coding = "utf_8_sig"
        # elif: there are some other codings with a BOM - feel free to add them here
        # check for UTF variants without a BOM:
        # Consider the order: check for UTF32 first!
        elif checkforUTF32_BE(a):
            coding = "utf_32_be"
        elif checkforUTF32_LE(a):
            coding = "utf_32_le"
        elif checkforUTF16_BE(a):
            coding = "utf_16_be"
        elif checkforUTF16_LE(a):
            coding = "utf_16_le"
        # so finally we only have to look for UTF8 without a BOM:
        else:
            coding = checkforUTF8(a)

        # Decode to text with the coding found. Since our guess may be wrong,
        # we replace unknown chars to avoid errors. Because we examine program
        # code files (i.e. true program code should consist of ASCII chars only),
        # these replacements should only affect string literals and comments
        # and should have no effect on the metric analysis.
        text = a.decode(coding, 'replace')
        # Finally replace possible line break variants with \n:
        # todo: replace with a regex
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")
        return text
    # end of readtextfile --------------------------------------------------
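
    # Example (hypothetical input): called on a latin-1 source file with CRLF
    # line endings, readtextfile() returns a unicode string with plain '\n'
    # separators; bytes that are invalid in the guessed encoding decode to the
    # U+FFFD replacement character instead of raising an error.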

    def run(self, plugin, directory):
        IS_TEST_MODE = False
        if 'METRIXPLUSPLUS_TEST_MODE' in os.environ:
            IS_TEST_MODE = True

        def run_per_file(plugin, fname, full_path):
            exit_code = 0
            norm_path = re.sub(r'''[\\]''', "/", full_path)
            if not os.path.isabs(norm_path) and not norm_path.startswith('./'):
                norm_path = './' + norm_path
            if not plugin.is_file_excluded(norm_path):
                if os.path.isdir(full_path):
                    if not plugin.non_recursively:
                        exit_code += run_recursively(plugin, full_path)
                else:
                    parser = plugin.get_parser(full_path)
                    if parser is None:
                        logging.info("Skipping: " + norm_path)
                    else:
                        logging.info("Processing: " + norm_path)
                        ts = time.time()
                        text = self.readtextfile(full_path)
                        checksum = binascii.crc32(text.encode('utf8')) & 0xffffffff  # to match python 3
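                        # binascii.crc32() returned signed values on Python 2;
                        # masking with 0xffffffff yields the unsigned CRC that
                        # Python 3 produces, so checksums stored in the db stay
                        # comparable across interpreter versions.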
                        db_loader = plugin.get_plugin('metrixpp.mpp.dbf').get_loader()
                        (data, is_updated) = db_loader.create_file_data(norm_path, checksum, text)
                        procerrors = parser.process(plugin, data, is_updated)
                        if plugin.is_proctime_enabled:
                            data.set_data('std.general', 'proctime',
                                          (time.time() - ts) if not IS_TEST_MODE else 0.01)
                        if plugin.is_procerrors_enabled and procerrors is not None and procerrors != 0:
                            data.set_data('std.general', 'procerrors', procerrors)
                        if plugin.is_size_enabled:
                            data.set_data('std.general', 'size', len(text))
                        db_loader.save_file_data(data)
                        exit_code += procerrors
            return exit_code

        #thread_pool = multiprocessing.pool.ThreadPool()
        #def mp_worker(args):
        #    run_per_file(args[0], args[1], args[2])
        def run_recursively(plugin, directory):
            exit_code = 0
            #thread_pool.map(mp_worker,
            #                [(plugin, f, os.path.join(subdir, f))
            #                 for subdir, dirs, files in os.walk(directory) for f in files])
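            # os.listdir() order is platform-dependent; sorting keeps the
            # traversal (and therefore the collected data and logs) deterministic.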
            for fname in sorted(os.listdir(directory)):
                full_path = os.path.join(directory, fname)
                exit_code += run_per_file(plugin, fname, full_path)
            return exit_code

        if not os.path.exists(directory):
            logging.error("Skipping (does not exist): " + directory)
            return 1
        if os.path.isdir(directory):
            total_errors = run_recursively(plugin, directory)
        else:
            total_errors = run_per_file(plugin, os.path.basename(directory), directory)
        total_errors = total_errors  # kept: warnings are reported per file if not zero
        return 0  # ignore parse errors - the collection itself is successful anyway