3 年之前 · 1a0207fcda
--- a/metrixpp/ext/std/tools/collect.py
+++ b/metrixpp/ext/std/tools/collect.py
@@ -1,9 +1,9 @@
 
																 #

															
 
																 #    Metrix++, Copyright 2009-2019, Metrix++ Project

															
 
																 #    Link: https://github.com/metrixplusplus/metrixplusplus

															
 
																-#    

															
 
																+#

															
 
																 #    This file is a part of Metrix++ Tool.

															
 
																-#    

															
 
																+#

															
 
																 from metrixpp.mpp import api

															
@@ -18,7 +18,7 @@ import fnmatch
 
																 import multiprocessing.pool

															
 
																 class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):

															
 
																-    

															
 
																+

															
 
																     def __init__(self):

															
 
																         self.reader = DirectoryReader()

															
 
																         self.include_rules = []

															
@@ -41,7 +41,7 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
 
																         parser.add_option("--non-recursively", "--nr", action="store_true", default=False,

															
 
																                          help="If the option is set (True), sub-directories are not processed [default: %default]")

															
 
																         self.optparser = parser

															
 
																-    

															
 
																+

															
 
																     def configure(self, options):

															
 
																         self.is_proctime_enabled = options.__dict__['std.general.proctime']

															
 
																         self.is_procerrors_enabled = options.__dict__['std.general.procerrors']

															
@@ -55,7 +55,7 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
 
																                 self.optparser.error("option --include-files: " + str(e))

															
 
																         else:

															
 
																             self.add_include_rule(re.compile(r'.*'))

															
 
																-        

															
 
																+

															
 
																         # check if any exclude rule is given

															
 
																         if options.__dict__['exclude_files']:

															
 
																             try:

															
@@ -78,7 +78,7 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
 
																         super(Plugin, self).initialize(namespace='std.general', support_regions=False, fields=fields)

															
 
																         self.add_exclude_file(self.get_plugin('metrixpp.mpp.dbf').get_dbfile_path())

															
 
																         self.add_exclude_file(self.get_plugin('metrixpp.mpp.dbf').get_dbfile_prev_path())

															
 
																-        

															
 
																+

															
 
																     def run(self, args):

															
 
																         if len(args) == 0:

															
 
																             return self.reader.run(self, "./")

															
@@ -86,7 +86,7 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
 
																         for directory in args:

															
 
																             retcode += self.reader.run(self, directory)

															
 
																         return retcode

															
 
																-        

															
 
																+

															
 
																     def register_parser(self, fnmatch_exp_list, parser):

															
 
																         self.parsers.append((fnmatch_exp_list, parser))

															
@@ -126,12 +126,132 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
 
																             if os.path.basename(each) == os.path.basename(file_name):

															
 
																                 if os.stat(each) == os.stat(file_name):

															
 
																                     return True

															
 
																-        return False 

															
 
																-        

															
 
																+        return False

															
 
																+

															
 
																 class DirectoryReader():

															
 
																-    

															
 
																+

															
 
																+    def readtextfile(self,filename):

															
 
																+        """ Read a text file and try to detect the coding

															
 
																+        """

															
 
																+

															
 
																+        # Subroutine - Check for UTF8:

															
 
																+        # "a" is the textfile represented as a simple byte array!

															
 
																+        # Find first char with code > 127:

															
 
																+        # 1 nothing found: all bytes in range(0..127); in this case "a" only consists

															
 
																+        #   of ASCII chars but this may also be treated as valid UTF8 coding

															
 
																+        # 2 Code is a valid UTF8 leading byte: range(176,271)

															
 
																+        #   then check subsequent bytes to be UTF8 extension bytes: range(128,175)

															
 
																+        #   Does also do some plausibility checks: If a valid UTF8 byte sequence is found

															
 
																+        #   - the subsequent byte (after the UTF8 sequence) must be an ASCII or

															
 
																+        #     another UTF8 leading byte (in the latter case we assume that there

															
 
																+        #     are following the appropriate number of UTF8 extension bytes..)

															
 
																+        #   Note that this checks don't guarantee the text is really UTF8 encoded:

															
 
																+        #   If a valid UTF8 sequence is found but in fact the text is some sort

															
 
																+        #   of 8 bit OEM coding this may be coincidentally a sequence of 8 bit

															
 
																+        #   OEM chars. This indeed seems very unlikely but may happen...

															
 
																+        #   Otherwise the whole text has to be examined for UTF8 sequences.

															
 
																+        # 3 Code is not a valid UTF8 leading byte: range(128,175) or range(272,255)

															
 
																+        #   In this case coding is some sort of 8 bit OEM coding. Since we don't

															
 
																+        #   know the OEM code page the file was written with, we assume "latin_1"

															
 
																+        #   (is mostly the same as ANSI but "ansi" isn't available on Python 2)

															
 
																+        #

															
 
																+        # return  suggested text coding: "ascii","utf_8" or "latin_1"

															
 
																+        def checkforUTF8(a,default="latin_1"):

															
 
																+            L = len(a)

															
 
																+            n = 0

															
 
																+            while ( (n < L) and (a[n] < 128) ):

															
 
																+                n = n+1

															
 
																+            if ( n >= L ):                          # all chars < 128: ASCII coding

															
 
																+                return "ascii"                      # but may also be treated as UTF8!

															
 
																+

															
 
																+            w = a[n]

															
 
																+

															
 
																+            if w in range(176,207):                 # UTF8 two byte sequence: leading byte + 1 extension byte

															
 
																+                if ( (n+1 < L)

															
 
																+                 and (a[n+1] in range(128,175))     # valid UTF8 extension byte

															
 
																+                ):

															
 
																+                    if ((n+2 == L)                  # w is last UTF8 character

															
 
																+                     or (a[n+2] < 128)              # or next byte is an ASCII char

															
 
																+                     or (a[n+2] in range(176,271))  # or next byte is an UTF8 leading byte

															
 
																+                    ):

															
 
																+                        return "utf_8"

															
 
																+                return default

															
 
																+

															
 
																+            if w in range(208,239):                 # UTF8 three byte sequence: leading byte + 2 extension bytes

															
 
																+                if ( (n+2 < L)

															
 
																+                 and (a[n+1] in range(128,175))     # 2 valid UTF8 extension bytes

															
 
																+                 and (a[n+2] in range(128,175))

															
 
																+                ):

															
 
																+                    if ((n+3 == L)                  # w is last UTF8 character

															
 
																+                     or (a[n+3] < 128)              # or next byte is ASCII char

															
 
																+                     or (a[n+3] in range(176,271))  # or next byte is UTF8 leading byte

															
 
																+                    ):

															
 
																+                        return "utf_8"

															
 
																+                return default

															
 
																+

															
 
																+            if w in range(240,271):                 # UTF8 four byte sequence: leading byte + 3 extension bytes

															
 
																+                if ( (n+3 < L)

															
 
																+                 and (a[n+1] in range(128,175))     # 3 valid UTF8 extension bytes

															
 
																+                 and (a[n+2] in range(128,175))

															
 
																+                 and (a[n+3] in range(128,175))

															
 
																+                ):

															
 
																+                    if ((n+4 == L)                  # w is last UTF8 character

															
 
																+                     or (a[n+4] < 128)              # or next byte is ASCII char

															
 
																+                     or (a[n+4] in range(176,271))  # or next byte is UTF8 leading byte

															
 
																+                    ):

															
 
																+                        return "utf_8"

															
 
																+                return default

															
 
																+          # end of checkforUTF8 ------------------------------------------------

															
 
																+

															
 
																+        # Subroutine readtextfile

															
 
																+        # open as binary and try to guess the encoding:

															
 
																+        f = open(filename, 'rb');

															
 
																+        a = f.read();

															
 
																+        f.close()

															
 
																+

															
 
																+        # check for codings with BOM:

															
 
																+        if a.startswith(b'\xff\xfe'):

															
 
																+            coding = "utf_16_le"

															
 
																+        elif a.startswith(b'\xfe\xff'):

															
 
																+            coding = "utf_16_be"

															
 
																+        elif a.startswith(b'\xff\xfe\x00\x00'):

															
 
																+            coding = "utf_32_le"

															
 
																+        elif a.startswith(b'\x00\x00\xfe\xff'):

															
 
																+            coding = "utf_32_be"

															
 
																+        elif a.startswith(b'\xef\xbb\xbf'):

															
 
																+            coding = "utf_8_sig"

															
 
																+        # elif: there are some other codings with BOM - feel free to add them here

															
 
																+

															
 
																+        # elif: check UTF variants without BOM:

															
 
																+        # at this point one may try to determine UTF16 or UTF32 codings without BOM

															
 
																+        # but this should not happen since these codings strictly require a BOM.

															
 
																+        # So finally we only have to look for UTF8 without BOM:

															
 
																+        else:

															
 
																+            coding = checkforUTF8(a)

															
 
																+

															
 
																+        # decode to text with found coding; since our guess may be wrong

															
 
																+        # we replace unknown chars to avoid errors. Cause we examine program code

															
 
																+        # files (i.e. true program code should only consist of ASCII chars) these

															
 
																+        # replacements only should affect string literals and comments and should

															
 
																+        # have no effect to metric analysis.

															
 
																+        text = a.decode(coding,'replace')

															
 
																+

															
 
																+        # Finally replace possible line break variants with \n:

															
 
																+        # todo: replace with a regex

															
 
																+        text = text.replace("\r\n","\n")

															
 
																+        text = text.replace("\r","\n")

															
 
																+

															
 
																+        # debug:

															
 
																+        #print(filename+" - Coding found = "+coding+" len: "+str(len(text)))

															
 
																+        #f = open(filename+"."+asCoding,'wb')

															
 
																+        #f.write(text.encode(coding))

															
 
																+        #f.close

															
 
																+

															
 
																+        return text

															
 
																+        # end of readtextfile --------------------------------------------------

															
 
																+

															
 
																     def run(self, plugin, directory):

															
 
																-        

															
 
																+

															
 
																         IS_TEST_MODE = False

															
 
																         if 'METRIXPLUSPLUS_TEST_MODE' in list(os.environ.keys()):

															
 
																             IS_TEST_MODE = True

															
@@ -152,22 +272,9 @@ class DirectoryReader():
 
																                     else:

															
 
																                         logging.info("Processing: " + norm_path)

															
 
																                         ts = time.time()

															
 
																-                        f = open(full_path, 'rU');

															
 
																-                        text = f.read();

															
 
																-                        # getting along with the different string handling of python 2 and 3

															
 
																-                        # trying to get along with different encodings to get the tests running

															
 
																-                        # on windows and linux

															
 
																-                        try:

															
 
																-                            text = text.encode(f.encoding)

															
 
																-                        except:

															
 
																-                            pass

															
 
																-                        try:

															
 
																-                            text = text.decode('utf-8')

															
 
																-                        except:

															
 
																-                            pass

															
 
																-                        f.close()

															
 
																+                        text = self.readtextfile(full_path)

															
 
																                         checksum = binascii.crc32(text.encode('utf8')) & 0xffffffff # to match python 3

															
 
																-                        

															
 
																+

															
 
																                         db_loader = plugin.get_plugin('metrixpp.mpp.dbf').get_loader()

															
 
																                         (data, is_updated) = db_loader.create_file_data(norm_path, checksum, text)

															
 
																                         procerrors = parser.process(plugin, data, is_updated)

															
@@ -184,7 +291,7 @@ class DirectoryReader():
 
																             else:

															
 
																                 logging.info("Excluding: " + norm_path)

															
 
																             return exit_code

															
 
																-        

															
 
																+

															
 
																         #thread_pool = multiprocessing.pool.ThreadPool()

															
 
																         #def mp_worker(args):

															
@@ -197,13 +304,13 @@ class DirectoryReader():
 
																             for fname in sorted(os.listdir(directory)):

															
 
																                 full_path = os.path.join(directory, fname)

															
 
																                 exit_code += run_per_file(plugin, fname, full_path)

															
 
																-            

															
 
																+

															
 
																             return exit_code

															
 
																-        

															
 
																+

															
 
																         if os.path.exists(directory) == False:

															
 
																             logging.error("Skipping (does not exist): " + directory)

															
 
																             return 1

															
 
																-        

															
 
																+

															
 
																         if os.path.isdir(directory):

															
 
																             total_errors = run_recursively(plugin, directory)

															
 
																         else: