@@ -1,9 +1,9 @@
#
# Metrix++, Copyright 2009-2019, Metrix++ Project
# Link: https://github.com/metrixplusplus/metrixplusplus
-#
+#
# This file is a part of Metrix++ Tool.
-#
+#


from metrixpp.mpp import api
@@ -18,7 +18,7 @@ import fnmatch
import multiprocessing.pool

class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
-
+
    def __init__(self):
        self.reader = DirectoryReader()
        self.include_rules = []
@@ -41,7 +41,7 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
        parser.add_option("--non-recursively", "--nr", action="store_true", default=False,
                          help="If the option is set (True), sub-directories are not processed [default: %default]")
        self.optparser = parser
-
+
    def configure(self, options):
        self.is_proctime_enabled = options.__dict__['std.general.proctime']
        self.is_procerrors_enabled = options.__dict__['std.general.procerrors']
@@ -55,7 +55,7 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
                self.optparser.error("option --include-files: " + str(e))
        else:
            self.add_include_rule(re.compile(r'.*'))
-
+
        # check if any exclude rule is given
        if options.__dict__['exclude_files']:
            try:
@@ -78,7 +78,7 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
        super(Plugin, self).initialize(namespace='std.general', support_regions=False, fields=fields)
        self.add_exclude_file(self.get_plugin('metrixpp.mpp.dbf').get_dbfile_path())
        self.add_exclude_file(self.get_plugin('metrixpp.mpp.dbf').get_dbfile_prev_path())
-
+
    def run(self, args):
        if len(args) == 0:
            return self.reader.run(self, "./")
@@ -86,7 +86,7 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
        for directory in args:
            retcode += self.reader.run(self, directory)
        return retcode
-
+
    def register_parser(self, fnmatch_exp_list, parser):
        self.parsers.append((fnmatch_exp_list, parser))

@@ -126,12 +126,197 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
            if os.path.basename(each) == os.path.basename(file_name):
                if os.stat(each) == os.stat(file_name):
                    return True
-        return False
-
+        return False
+
class DirectoryReader():
-
+
+    def readtextfile(self, filename):
+        """ Read a text file and try to detect the coding
+
+        Since we examine program code text files we can assume the following:
+        - There are no NUL characters, i.e. no 0x00 sequences of 1, 2 or 4
+          bytes, starting on 1, 2 or 4 byte boundaries (depending on
+          1, 2 or 4 byte coding)
+        - There should be at least one space (ASCII 0x20) char
+          of the respective length (1, 2 or 4 byte)
+        - Program code consists of only ASCII chars, i.e. code < 128
+        - Non-ASCII chars should appear in string literals and comments only
+
+        Especially in the case of an 8 bit coding it does not matter which
+        code page we use: metric analysis is done on program code, which is
+        pure ASCII; string literals and comments are only recognized as such
+        but not interpreted, so it does not matter if they contain non-ASCII
+        chars, whichever code page is used.
+
+        Note the decoder's different behavior for the "utf_nn" identifiers:
+        - .decode("utf_32") / .decode("utf_16"): preceding BOM is skipped
+        - with suffix ".._be" or ".._le" respectively: preceding BOM is preserved,
+        but
+        - .decode("utf_8"): preceding BOM is preserved
+        - .decode("utf_8_sig"): preceding BOM is skipped
+        """
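+        # Illustration of the decoder behavior described above (Python 3;
+        # example byte strings, not from the original code):
+        #   b'\xff\xfeA\x00'.decode("utf_16")    -> 'A'        (BOM skipped)
+        #   b'\xff\xfeA\x00'.decode("utf_16_le") -> '\ufeffA'  (BOM preserved)
+        #   b'\xef\xbb\xbfA'.decode("utf_8")     -> '\ufeffA'  (BOM preserved)
+        #   b'\xef\xbb\xbfA'.decode("utf_8_sig") -> 'A'        (BOM skipped)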
+        # Methods to check for various UTF variants without BOM:
+        # Since UTF16/32 codings are recommended to use a BOM these methods
+        # shouldn't be necessary but may be useful in certain cases.
+        def checkforUTF32_BE(a):
+            if ( (len(a) % 4) != 0 ): return False
+            n = a.find(b'\x00\x00\x00\x20')
+            return (n >= 0) and ((n % 4) == 0)
+        def checkforUTF32_LE(a):
+            if ( (len(a) % 4) != 0 ): return False
+            n = a.find(b'\x20\x00\x00\x00')
+            return (n >= 0) and ((n % 4) == 0)
+        def checkforUTF16_BE(a):
+            if ( (len(a) % 2) != 0 ): return False
+            n = a.find(b'\x00\x20')
+            return (n >= 0) and ((n % 2) == 0)
+        def checkforUTF16_LE(a):
+            if ( (len(a) % 2) != 0 ): return False
+            n = a.find(b'\x20\x00')
+            return (n >= 0) and ((n % 2) == 0)
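+        # Example inputs (illustration only, not from the original code):
+        # a big-endian UTF-32 space is b'\x00\x00\x00\x20' on a 4-byte
+        # boundary, a little-endian UTF-16 space is b'\x20\x00' on a 2-byte
+        # boundary:
+        #   checkforUTF32_BE(b'\x00\x00\x00A\x00\x00\x00\x20') -> True
+        #   checkforUTF16_LE(b'A\x00\x20\x00')                 -> True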
+
+        # Method to check for UTF8 without BOM:
+        # "a" is the text file represented as a simple byte array!
+        # Find the first char with code > 127:
+        #
+        # 1 nothing found: all bytes 0..127; in this case "a" only consists
+        #   of ASCII chars, but this may also be treated as valid UTF8 coding
+        #
+        # 2 Code is a valid UTF8 leading byte: 192..243
+        #   then check subsequent bytes to be UTF8 extension bytes: 128..191
+        #   Also does some additional plausibility checks:
+        #   if a valid UTF8 byte sequence is found,
+        #   - the subsequent byte (after the UTF8 sequence) must be an ASCII
+        #     char or another UTF8 leading byte (in the latter case we assume
+        #     that the appropriate number of UTF8 extension bytes follows..)
+        #   Note that these checks don't guarantee the text is really UTF8 encoded:
+        #   if a valid UTF8 sequence is found but in fact the text is some sort
+        #   of 8 bit OEM coding, this may coincidentally be a sequence of 8 bit
+        #   OEM chars. This indeed seems very unlikely but may happen...
+        #   Even if the whole text were examined for UTF8 sequences: every
+        #   valid UTF8 sequence found may also be a sequence of OEM chars!
+        #
+        # 3 Code is not a valid UTF8 leading byte: 128..191 or 244..255
+        #   In this case the coding is some sort of 8 bit OEM coding. Since we
+        #   don't know the OEM code page the file was written with, we assume
+        #   "latin_1" (mostly the same as ANSI, but "ansi" isn't available on
+        #   Python 2)
+        #
+        # returns the suggested text coding: "ascii", "utf_8" or "latin_1"
+        # (resp. the given default)
+        def checkforUTF8(a, default="latin_1"):
+
+            # Since "a" is a string array on Python 2 we use a special ORD function:
+            # convert c to its byte representation if it is a character
+            # Works for Python 2+3
+            def ORD(c): return ord(c) if (type(c) == str) else c
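+            # (illustration: indexing a bytes object yields a 1-char str on
+            # Python 2 but an int on Python 3; ORD normalizes both, e.g.
+            # ORD('A') == ORD(65) == 65)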
+
+            L = len(a)
+            n = 0
+            while ( (n < L) and (ORD(a[n]) < 128) ): # (a[n] < ExtASCII) ):
+                n = n+1
+            if ( n >= L ):      # all chars < 128: ASCII coding
+                return "ascii"  # but may also be treated as UTF8!
+            w = a[n]
+
+            # UTF8 two byte sequence: leading byte + 1 extension byte
+            if ORD(w) in range(192,224):
+                if ( (n+1 < L)
+                     and (ORD(a[n+1]) in range(128,192)) # valid UTF8 extension byte
+                   ):
+                    if ((n+2 == L)                         # w is last character
+                        or (ORD(a[n+2]) < 128)             # or next byte is an ASCII char
+                        or (ORD(a[n+2]) in range(192,244)) # or next byte is an UTF8 leading byte
+                       ):
+                        return "utf_8"
+                return default
+
+            # UTF8 three byte sequence: leading byte + 2 extension bytes
+            if ORD(w) in range(224,240):
+                if ( (n+2 < L)
+                     and (ORD(a[n+1]) in range(128,192)) # 2 valid UTF8 extension bytes
+                     and (ORD(a[n+2]) in range(128,192))
+                   ):
+                    if ((n+3 == L)                         # w is last character
+                        or (ORD(a[n+3]) < 128)             # or next byte is ASCII char
+                        or (ORD(a[n+3]) in range(192,244)) # or next byte is UTF8 leading byte
+                       ):
+                        return "utf_8"
+                return default
+
+            # UTF8 four byte sequence: leading byte + 3 extension bytes
+            if ORD(w) in range(240,244):
+                if ( (n+3 < L)
+                     and (ORD(a[n+1]) in range(128,192)) # 3 valid UTF8 extension bytes
+                     and (ORD(a[n+2]) in range(128,192))
+                     and (ORD(a[n+3]) in range(128,192))
+                   ):
+                    if ((n+4 == L)                         # w is last character
+                        or (ORD(a[n+4]) < 128)             # or next byte is ASCII char
+                        or (ORD(a[n+4]) in range(192,244)) # or next byte is UTF8 leading byte
+                       ):
+                        return "utf_8"
+                return default
+
+            # no valid UTF8 byte sequence:
+            return default
+        # end of checkforUTF8 ------------------------------------------------
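+        # Usage sketch (illustration only, not part of the original code):
+        #   checkforUTF8(b'abc')         -> "ascii"   (pure ASCII)
+        #   checkforUTF8(b'ab\xc3\xa9c') -> "utf_8"   (0xC3 0xA9 is UTF8 'é')
+        #   checkforUTF8(b'ab\xe9c')     -> "latin_1" (bare 0xE9, no valid UTF8 sequence)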
+
+        # ----------------------------------------------------------------------
+        # Subroutine readtextfile
+        # open as binary and try to guess the encoding
+        # attention:
+        # - Python 3: "a" is a bytes array
+        # - Python 2: "a" is a string array!
+        # ----------------------------------------------------------------------
+        f = open(filename, 'rb')
+        a = f.read()
+        f.close()
+
+        # check for codings with BOM:
+        # Consider the order: check for UTF32 first!
+        if (a.startswith(b'\xff\xfe\x00\x00')
+            or a.startswith(b'\x00\x00\xfe\xff')):
+            coding = "utf_32"    # no suffix _be/_le --> decoder skips the BOM
+        elif (a.startswith(b'\xff\xfe')
+              or a.startswith(b'\xfe\xff')):
+            coding = "utf_16"    # no suffix _be/_le --> decoder skips the BOM
+        elif a.startswith(b'\xef\xbb\xbf'):
+            coding = "utf_8_sig"
+
+        # elif: there are some other codings with BOM - feel free to add them here
+
+        # check for UTF variants without BOM:
+        # Consider the order: check for UTF32 first!
+        elif checkforUTF32_BE(a):
+            coding = "utf_32_be"
+        elif checkforUTF32_LE(a):
+            coding = "utf_32_le"
+        elif checkforUTF16_BE(a):
+            coding = "utf_16_be"
+        elif checkforUTF16_LE(a):
+            coding = "utf_16_le"
+
+        # So finally we only have to look for UTF8 without BOM:
+        else:
+            coding = checkforUTF8(a)
+
+        # decode to text with the coding found; since our guess may be wrong,
+        # we replace unknown chars to avoid errors. Because we examine program
+        # code files (i.e. true program code should only consist of ASCII chars),
+        # these replacements should only affect string literals and comments
+        # and should have no effect on the metric analysis.
+        text = a.decode(coding, 'replace')
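+        # e.g. (illustration): b'ab\xffc'.decode("utf_8", "replace") yields
+        # 'ab\ufffdc'; by the assumptions above, the replacement char can only
+        # land in string literals or comments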
+
+        # Finally replace possible line break variants with \n:
+        # todo: replace with a regex
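+        # (untested sketch: a single pass like re.sub(r'\r\n?', '\n', text)
+        # would cover both variants; "re" is already imported at module level)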
+        text = text.replace("\r\n", "\n")
+        text = text.replace("\r", "\n")
+
+        return text
+
+    # end of readtextfile --------------------------------------------------
+
    def run(self, plugin, directory):
-
+
        IS_TEST_MODE = False
        if 'METRIXPLUSPLUS_TEST_MODE' in list(os.environ.keys()):
            IS_TEST_MODE = True
@@ -152,22 +337,11 @@ class DirectoryReader():
            else:
                logging.info("Processing: " + norm_path)
                ts = time.time()
-                f = open(full_path, 'rU');
-                text = f.read();
-                # getting along with the different string handling of python 2 and 3
-                # trying to get along with different encodings to get the tests running
-                # on windows and linux
-                try:
-                    text = text.encode(f.encoding)
-                except:
-                    pass
-                try:
-                    text = text.decode('utf-8')
-                except:
-                    pass
-                f.close()
+
+                text = self.readtextfile(full_path)
+                #text = self.readfile_org(full_path)
                checksum = binascii.crc32(text.encode('utf8')) & 0xffffffff # to match python 3
-
+
                db_loader = plugin.get_plugin('metrixpp.mpp.dbf').get_loader()
                (data, is_updated) = db_loader.create_file_data(norm_path, checksum, text)
                procerrors = parser.process(plugin, data, is_updated)
@@ -184,7 +358,7 @@ class DirectoryReader():
            else:
                logging.info("Excluding: " + norm_path)
            return exit_code
-
+

        #thread_pool = multiprocessing.pool.ThreadPool()
        #def mp_worker(args):
@@ -197,13 +371,13 @@ class DirectoryReader():
            for fname in sorted(os.listdir(directory)):
                full_path = os.path.join(directory, fname)
                exit_code += run_per_file(plugin, fname, full_path)
-
+
            return exit_code
-
+
        if os.path.exists(directory) == False:
            logging.error("Skipping (does not exist): " + directory)
            return 1
-
+
        if os.path.isdir(directory):
            total_errors = run_recursively(plugin, directory)
        else: