
Merge pull request #56 from thom-sch/Issue43_UnicodeDecodeError

Detect correct file coding
Andrey 1 year ago
commit 78dc5380de
1 changed file with 204 additions and 30 deletions

+ 204 - 30
metrixpp/ext/std/tools/collect.py

@@ -1,9 +1,9 @@
 #
 #    Metrix++, Copyright 2009-2019, Metrix++ Project
 #    Link: https://github.com/metrixplusplus/metrixplusplus
-#    
+#
 #    This file is a part of Metrix++ Tool.
-#    
+#
 
 
 from metrixpp.mpp import api
@@ -18,7 +18,7 @@ import fnmatch
 import multiprocessing.pool
 
 class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
-    
+
     def __init__(self):
         self.reader = DirectoryReader()
         self.include_rules = []
@@ -41,7 +41,7 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
         parser.add_option("--non-recursively", "--nr", action="store_true", default=False,
                          help="If the option is set (True), sub-directories are not processed [default: %default]")
         self.optparser = parser
-    
+
     def configure(self, options):
         self.is_proctime_enabled = options.__dict__['std.general.proctime']
         self.is_procerrors_enabled = options.__dict__['std.general.procerrors']
@@ -55,7 +55,7 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
                 self.optparser.error("option --include-files: " + str(e))
         else:
             self.add_include_rule(re.compile(r'.*'))
-        
+
         # check if any exclude rule is given
         if options.__dict__['exclude_files']:
             try:
@@ -78,7 +78,7 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
         super(Plugin, self).initialize(namespace='std.general', support_regions=False, fields=fields)
         self.add_exclude_file(self.get_plugin('metrixpp.mpp.dbf').get_dbfile_path())
         self.add_exclude_file(self.get_plugin('metrixpp.mpp.dbf').get_dbfile_prev_path())
-        
+
     def run(self, args):
         if len(args) == 0:
             return self.reader.run(self, "./")
@@ -86,7 +86,7 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
         for directory in args:
             retcode += self.reader.run(self, directory)
         return retcode
-        
+
     def register_parser(self, fnmatch_exp_list, parser):
         self.parsers.append((fnmatch_exp_list, parser))
 
@@ -126,12 +126,197 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
             if os.path.basename(each) == os.path.basename(file_name):
                 if os.stat(each) == os.stat(file_name):
                     return True
-        return False 
-        
+        return False
+
 class DirectoryReader():
-    
+
+    def readtextfile(self, filename):
+        """ Read a text file and try to detect the coding
+
+            Since we examine program code text files we can assume the following:
+            - There are no NUL characters, i.e. no 0x00 sequences of 1, 2 or 4
+              bytes, starting on 1, 2 or 4 byte boundaries (depending on
+              1, 2 or 4 byte coding)
+            - There should be at least one space (ASCII 0x20) char
+              of the respective length (1, 2 or 4 bytes)
+            - Program code consists of ASCII chars only, i.e. code < 128
+            - Non-ASCII chars should appear in string literals and comments only
+
+            Especially in the case of an 8 bit coding it does not matter
+            which code page to use: metric analysis is done on program code,
+            which is pure ASCII; string literals and comments are only recognized
+            as such but not interpreted, so it does not matter if they contain
+            non-ASCII chars, whichever code page is used.
+
+            Note the decoders' different behavior for the "utf_nn" identifiers:
+            - .decode("utf_32") / .decode("utf_16"):       a leading BOM is skipped
+            - with suffix ".._be" or ".._le" respectively: a leading BOM is preserved
+            but
+            - .decode("utf_8"):     a leading BOM is preserved
+            - .decode("utf_8_sig"): a leading BOM is skipped
+        """
+        # Methods to check for various UTF variants without BOM:
+        # Since UTF16/32 codings are expected to use a BOM, these methods
+        # shouldn't be necessary, but they may be useful in certain cases.
+        def checkforUTF32_BE(a):
+            if ( (len(a) % 4) != 0 ): return False
+            n = a.find(b'\x00\x00\x00\x20')
+            return (n >= 0) and ((n % 4) == 0)
+        def checkforUTF32_LE(a):
+            if ( (len(a) % 4) != 0 ): return False
+            n = a.find(b'\x20\x00\x00\x00')
+            return (n >= 0) and ((n % 4) == 0)
+        def checkforUTF16_BE(a):
+            if ( (len(a) % 2) != 0 ): return False
+            n = a.find(b'\x00\x20')
+            return (n >= 0) and ((n % 2) == 0)
+        def checkforUTF16_LE(a):
+            if ( (len(a) % 2) != 0 ): return False
+            n = a.find(b'\x20\x00')
+            return (n >= 0) and ((n % 2) == 0)
+
+        # Method to check for UTF8 without BOM:
+        # "a" is the textfile represented as a simple byte array!
+        # Find first char with code > 127:
+        #
+        # 1 nothing found: all bytes 0..127; in this case "a" only consists
+        #   of ASCII chars but this may also be treated as valid UTF8 coding
+        #
+        # 2 Code is a valid UTF8 leading byte: 192..243
+        #   then check subsequent bytes to be UTF8 extension bytes: 128..191
+        #   Also does some additional plausibility checks:
+        #   If a valid UTF8 byte sequence is found,
+        #   - the subsequent byte (after the UTF8 sequence) must be an ASCII char
+        #   - or another UTF8 leading byte (in the latter case we assume that
+        #     the appropriate number of UTF8 extension bytes follows..)
+        #   Note that these checks don't guarantee the text really is UTF8 encoded:
+        #   If a valid UTF8 sequence is found but the text is in fact some sort
+        #   of 8 bit OEM coding, this may coincidentally be a sequence of 8 bit
+        #   OEM chars. This seems very unlikely but may happen...
+        #   Even if the whole text were examined for UTF8 sequences: every
+        #   valid UTF8 sequence found may also be a sequence of OEM chars!
+        #
+        # 3 Code is not a valid UTF8 leading byte: 128..191 or 244..255
+        #   In this case the coding is some sort of 8 bit OEM coding. Since we
+        #   don't know the OEM code page the file was written with, we assume
+        #   "latin_1" (mostly the same as ANSI, but "ansi" isn't available on
+        #   Python 2)
+        #
+        # Returns the suggested text coding: "ascii", "utf_8" or "latin_1"
+        # (resp. the given default)
+        def checkforUTF8(a, default="latin_1"):
+
+            # Since "a" is a string array on Python 2 we use a special ORD function:
+            # Convert c to its byte representation if it is a character
+            # Works for Python 2+3
+            def ORD(c): return ord(c) if (type(c) == str) else c
+
+            L = len(a)
+            n = 0
+            while ( (n < L) and (ORD(a[n]) < 128) ):  # find first byte > 127
+                n = n+1
+            if ( n >= L ):                          # all chars < 128: ASCII coding
+                return "ascii"                      # but may also be treated as UTF8!
+            w = a[n]
+
+            # UTF8 two byte sequence: leading byte + 1 extension byte
+            if ORD(w) in range(192,224):
+                if ( (n+1 < L)
+                 and (ORD(a[n+1]) in range(128,192))     # valid UTF8 extension byte
+                ):
+                    if ((n+2 == L)                  # w is last character
+                     or (ORD(a[n+2]) < 128)              # or next byte is an ASCII char
+                     or (ORD(a[n+2]) in range(192,244))  # or next byte is an UTF8 leading byte
+                    ):
+                        return "utf_8"
+                return default
+
+            # UTF8 three byte sequence: leading byte + 2 extension bytes
+            if ORD(w) in range(224,240):
+                if ( (n+2 < L)
+                 and (ORD(a[n+1]) in range(128,192))     # 2 valid UTF8 extension bytes
+                 and (ORD(a[n+2]) in range(128,192))
+                ):
+                    if ((n+3 == L)                  # w is last character
+                     or (ORD(a[n+3]) < 128)              # or next byte is ASCII char
+                     or (ORD(a[n+3]) in range(192,244))  # or next byte is UTF8 leading byte
+                    ):
+                        return "utf_8"
+                return default
+
+            # UTF8 four byte sequence: leading byte + 3 extension bytes
+            if ORD(w) in range(240,244):
+                if ( (n+3 < L)
+                 and (ORD(a[n+1]) in range(128,192))     # 3 valid UTF8 extension bytes
+                 and (ORD(a[n+2]) in range(128,192))
+                 and (ORD(a[n+3]) in range(128,192))
+                ):
+                    if ((n+4 == L)                  # w is last character
+                     or (ORD(a[n+4]) < 128)              # or next byte is ASCII char
+                     or (ORD(a[n+4]) in range(192,244))  # or next byte is UTF8 leading byte
+                    ):
+                        return "utf_8"
+                return default
+
+            # no valid UTF8 byte sequence:
+            return default
+          # end of checkforUTF8 ------------------------------------------------
+
+        # ----------------------------------------------------------------------
+        # Main part of readtextfile:
+        # open as binary and try to guess the coding
+        # attention:
+        # - Python 3: "a" is a binary array
+        # - Python 2: "a" is a string array!
+        # ----------------------------------------------------------------------
+        f = open(filename, 'rb')
+        a = f.read()
+        f.close()
+
+        # check for codings with BOM:
+        # Consider the order: Check for UTF32 first!
+        if  (a.startswith(b'\xff\xfe\x00\x00')
+          or a.startswith(b'\x00\x00\xfe\xff')):
+            coding = "utf_32"       # no suffix _be/_le --> decoder skips the BOM
+        elif (a.startswith(b'\xff\xfe')
+           or a.startswith(b'\xfe\xff')):
+            coding = "utf_16"       # no suffix _be/_le --> decoder skips the BOM
+        elif a.startswith(b'\xef\xbb\xbf'):
+            coding = "utf_8_sig"
+
+        # elif: there are some other codings with BOM - feel free to add them here
+
+        # check for UTF variants without BOM:
+        # Consider the order: Check for UTF32 first!
+        elif checkforUTF32_BE(a):
+            coding = "utf_32_be"
+        elif checkforUTF32_LE(a):
+            coding = "utf_32_le"
+        elif checkforUTF16_BE(a):
+            coding = "utf_16_be"
+        elif checkforUTF16_LE(a):
+            coding = "utf_16_le"
+
+        # So finally we only have to look for UTF8 without BOM:
+        else:
+            coding = checkforUTF8(a)
+
+        # Decode to text with the coding found; since our guess may be wrong,
+        # we replace unknown chars to avoid errors. Because we examine program
+        # code files (i.e. true program code should consist of ASCII chars only),
+        # these replacements should only affect string literals and comments
+        # and should have no effect on metric analysis.
+        text = a.decode(coding,'replace')
+
+        # Finally replace possible line break variants with \n:
+        # todo: replace with a regex
+        text = text.replace("\r\n","\n")
+        text = text.replace("\r","\n")
+
+        return text
+
+        # end of readtextfile --------------------------------------------------
+
     def run(self, plugin, directory):
-        
+
         IS_TEST_MODE = False
         if 'METRIXPLUSPLUS_TEST_MODE' in list(os.environ.keys()):
             IS_TEST_MODE = True
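
Note on the hunk above: the BOM checks must test UTF32 before UTF16, because the UTF-32 LE BOM (FF FE 00 00) begins with the UTF-16 LE BOM (FF FE). The docstring's claims about BOM handling by the decoders can be verified with a minimal sketch; detect_bom_coding below is a hypothetical stand-in for the cascade in readtextfile, not part of this patch:

    def detect_bom_coding(data):
        # UTF32 first: its LE BOM (ff fe 00 00) starts with the UTF16 LE BOM (ff fe)
        if data.startswith(b'\xff\xfe\x00\x00') or data.startswith(b'\x00\x00\xfe\xff'):
            return "utf_32"
        if data.startswith(b'\xff\xfe') or data.startswith(b'\xfe\xff'):
            return "utf_16"
        if data.startswith(b'\xef\xbb\xbf'):
            return "utf_8_sig"
        return None

    data = b'\xff\xfe' + 'hi'.encode('utf_16_le')    # UTF16-LE content with BOM
    assert detect_bom_coding(data) == "utf_16"
    assert data.decode("utf_16") == 'hi'             # no suffix: BOM is skipped
    assert data.decode("utf_16_le") == '\ufeffhi'    # with suffix: BOM is preserved
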
@@ -152,22 +337,11 @@ class DirectoryReader():
                     else:
                         logging.info("Processing: " + norm_path)
                         ts = time.time()
-                        f = open(full_path, 'rU');
-                        text = f.read();
-                        # getting along with the different string handling of python 2 and 3
-                        # trying to get along with different encodings to get the tests running
-                        # on windows and linux
-                        try:
-                            text = text.encode(f.encoding)
-                        except:
-                            pass
-                        try:
-                            text = text.decode('utf-8')
-                        except:
-                            pass
-                        f.close()
+
+                        text = self.readtextfile(full_path)
                         checksum = binascii.crc32(text.encode('utf8')) & 0xffffffff # to match python 3
-                        
+
                         db_loader = plugin.get_plugin('metrixpp.mpp.dbf').get_loader()
                         (data, is_updated) = db_loader.create_file_data(norm_path, checksum, text)
                         procerrors = parser.process(plugin, data, is_updated)
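
The checksum line kept from the original code masks binascii.crc32 with 0xffffffff because Python 2 may return a negative (signed) result, while Python 3 always returns an unsigned 32-bit value; the mask makes both agree. A short illustration (the sample text is arbitrary):

    import binascii

    # On Python 2, binascii.crc32 may return a negative signed int; masking
    # with 0xffffffff yields the unsigned value that Python 3 returns directly.
    text = u'int main() { return 0; }\n'
    checksum = binascii.crc32(text.encode('utf8')) & 0xffffffff
    assert 0 <= checksum <= 0xffffffff
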
@@ -184,7 +358,7 @@ class DirectoryReader():
             else:
                 logging.info("Excluding: " + norm_path)
             return exit_code
-        
+
 
         #thread_pool = multiprocessing.pool.ThreadPool()
         #def mp_worker(args):
@@ -197,13 +371,13 @@ class DirectoryReader():
             for fname in sorted(os.listdir(directory)):
                 full_path = os.path.join(directory, fname)
                 exit_code += run_per_file(plugin, fname, full_path)
-            
+
             return exit_code
-        
+
         if os.path.exists(directory) == False:
             logging.error("Skipping (does not exist): " + directory)
             return 1
-        
+
         if os.path.isdir(directory):
             total_errors = run_recursively(plugin, directory)
         else:
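
Regarding the "todo: replace with a regex" left in readtextfile: one possible regex form of the line-break normalization, equivalent to the two sequential replace calls (a sketch only; normalize_newlines is a hypothetical name, not part of this commit):

    import re

    # '\r\n?' matches "\r\n" first (greedy '?'), then any lone "\r", so this
    # equals replace("\r\n", "\n") followed by replace("\r", "\n").
    def normalize_newlines(text):
        return re.sub(r'\r\n?', '\n', text)

    assert normalize_newlines('a\r\nb\rc\n') == 'a\nb\nc\n'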