2 years ago · a2af7e8730
--- a/metrixpp/ext/std/tools/collect.py
+++ b/metrixpp/ext/std/tools/collect.py
@@ -130,6 +130,23 @@ class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):
 
				 

			
 
				 class DirectoryReader():

			
 
				 

			
 
				+    def debugout(self,filename,text,coding):

			
 
				+        # Open as Text-File - explicit "utf-8" since default depends on machine's OS

			
 
				+        #f = open(filename+".utf-8",'w',encoding="utf-8")

			
 
				+        #f.write(text)

			
 
				+

			
 
				+        # Open as Binary-File

			
 
				+        #coding = coding    # write in original coding: If our guess was true written file should have the same size as the original file

			
 
				+        # or:

			
 
				+        #coding = "utf-8"   # write as UTF-8:

			
 
				+                            # If our guess is true same files in different encodings should result in identical files

			
 
				+                            # If our guess was false (esp. if real coding is an 8-bit coding != latin_1) file length may be different!

			
 
				+        #f = open(filename+"."+coding,'wb')

			
 
				+        #f.write(text.encode(coding))

			
 
				+

			
 
				+        #f.close

			
 
				+        return

			
 
				+

			
 
				     def readtextfile(self,filename):

			
 
				         """ Read a text file and try to detect the coding

			
 
				 

			
@@ -137,10 +154,10 @@ class DirectoryReader():
 
				             - There are no NUL characters, i.e. no 0x00 sequences of 1, 2 or 4

			
 
				               byte, starting on 1, 2 or 4 byte boundaries (depending on

			
 
				               1, 2 or 4 byte coding)

			
 
				-            - There should at least 1 line terminated with an end of line

			
 
				-              character, i.e. \n or \r of the respective length (1,2 or 4 byte)

			
 
				+            - There should at least one space (ASCII 0x20) char

			
 
				+              of the respective length (1,2 or 4 byte))

			
 
				             - Program code consists of only ASCII chars, i.e. code < 128

			
 
				-            - Non ASCII chars should only appear in string literals and comments

			
 
				+            - Non ASCII chars should appear in string literals and comments only

			
 
				 

			
 
				             Though especially in the case of an 8 bit coding it does not matter

			
 
				             which code page to use: Metric analysis is done on program code

			
@@ -148,50 +165,42 @@ class DirectoryReader():
 
				             as such but not interpreted, though it doesn't matter if they contain

			
 
				             non-ASCII chars whichever code page is used.

			
 
				 

			
 
				-            Note the decoder's different behavior for the "utf-nn" identifiers:

			
 
				-            - .decode("utf-32") / .decode("utf-16"):       preceding BOM is skipped

			
 
				-            - with suffix "..-be" or "..-le" respectively: preceding BOM is preserved

			
 
				+            Note the decoder's different behavior for the "utf_nn" identifiers:

			
 
				+            - .decode("utf_32") / .decode("utf_16"):       preceding BOM is skipped

			
 
				+            - with suffix ".._be" or ".._le" respectively: preceding BOM is preserved

			
 
				             but

			
 
				-            - .decode("utf-8"):     preceding BOM is preserved

			
 
				-            - .decode("utf-8-sig"): preceding BOM is skipped

			
 
				+            - .decode("utf_8"):     preceding BOM is preserved

			
 
				+            - .decode("utf_8_sig"): preceding BOM is skipped

			
 
				         """

			
 
				         # Methods to check for various UTF variants without BOM:

			
 
				         # Since UTF16/32 codings are recommended to use a BOM these methods

			
 
				         # shouldn't be necessary but may be useful in certain cases.

			
 
				         def checkforUTF32_BE(a):

			
 
				             if ( (len(a) % 4) != 0 ): return False

			
 
				-            n = a.find(b'\x00\x00\x00\n')

			
 
				-            if n < 0:

			
 
				-                n = a.find(b'\x00\x00\x00\r')

			
 
				+            n = a.find(b'\x00\x00\x00\x20')

			
 
				             return (n >= 0) and ((n % 4) == 0)

			
 
				         def checkforUTF32_LE(a):

			
 
				             if ( (len(a) % 4) != 0 ): return False

			
 
				-            n = a.find(b'\n\x00\x00\x00')

			
 
				-            if n < 0:

			
 
				-                n = a.find(b'\r\x00\x00\x00')

			
 
				+            n = a.find(b'\x20\x00\x00\x00')

			
 
				             return (n >= 0) and ((n % 4) == 0)

			
 
				         def checkforUTF16_BE(a):

			
 
				             if ( (len(a) % 2) != 0 ): return False

			
 
				-            n = a.find(b'\x00\n')

			
 
				-            if n < 0:

			
 
				-                n = a.find(b'\x00\r')

			
 
				+            n = a.find(b'\x00\x20')

			
 
				             return (n >= 0) and ((n % 2) == 0)

			
 
				         def checkforUTF16_LE(a):

			
 
				             if ( (len(a) % 2) != 0 ): return False

			
 
				-            n = a.find(b'\n\x00')

			
 
				-            if n < 0:

			
 
				-                n = a.find(b'\r\x00')

			
 
				+            n = a.find(b'\x20\x00')

			
 
				             return (n >= 0) and ((n % 2) == 0)

			
 
				 

			
 
				         # Method to check for UTF8 without BOM:

			
 
				         # "a" is the textfile represented as a simple byte array!

			
 
				         # Find first char with code > 127:

			
 
				         #

			
 
				-        # 1 nothing found: all bytes in range(0..127); in this case "a" only consists

			
 
				+        # 1 nothing found: all bytes 0..127; in this case "a" only consists

			
 
				         #   of ASCII chars but this may also be treated as valid UTF8 coding

			
 
				         #

			
 
				-        # 2 Code is a valid UTF8 leading byte: range(176,271)

			
 
				-        #   then check subsequent bytes to be UTF8 extension bytes: range(128,175)

			
 
				+        # 2 Code is a valid UTF8 leading byte: 176..271

			
 
				+        #   then check subsequent bytes to be UTF8 extension bytes: 128..175

			
 
				         #   Does also do some additional plausibility checks:

			
 
				         #   If a valid UTF8 byte sequence is found

			
 
				         #   - the subsequent byte (after the UTF8 sequence) must be an ASCII

			
@@ -201,60 +210,65 @@ class DirectoryReader():
 
				         #   If a valid UTF8 sequence is found but in fact the text is some sort

			
 
				         #   of 8 bit OEM coding this may be coincidentally a sequence of 8 bit

			
 
				         #   OEM chars. This indeed seems very unlikely but may happen...

			
 
				-        #   Even though the whole text was examined for UTF8 sequences: every

			
 
				+        #   Even though the whole text would examined for UTF8 sequences: every

			
 
				         #   valid UTF8 sequence found may also be a sequence of OEM chars!

			
 
				         #

			
 
				-        # 3 Code is not a valid UTF8 leading byte: range(128,175) or range(272,255)

			
 
				+        # 3 Code is not a valid UTF8 leading byte: 128..175 or 272..255

			
 
				         #   In this case coding is some sort of 8 bit OEM coding. Since we don't

			
 
				         #   know the OEM code page the file was written with, we assume "latin_1"

			
 
				         #   (is mostly the same as ANSI but "ansi" isn't available on Python 2)

			
 
				         #

			
 
				         # return  suggested text coding: "ascii","utf_8" or "latin_1" (resp. default)

			
 
				         def checkforUTF8(a,default="latin_1"):

			
 
				+

			
 
				+            # Since "a" is a string array on Python 2 we use a special ORD function:

			
 
				+            # Convert c to its byte representation if it is a character

			
 
				+            # Works for Python 2+3

			
 
				+            def ORD(c): return ord(c) if (type(c) == str) else c

			
 
				+

			
 
				             L = len(a)

			
 
				             n = 0

			
 
				-            while ( (n < L) and (a[n] < 128) ):

			
 
				+            while ( (n < L) and (ORD(a[n]) < 128) ): # (a[n] < ExtASCII) ):

			
 
				                 n = n+1

			
 
				             if ( n >= L ):                          # all chars < 128: ASCII coding

			
 
				                 return "ascii"                      # but may also be treated as UTF8!

			
 
				-

			
 
				             w = a[n]

			
 
				 

			
 
				             # UTF8 two byte sequence: leading byte + 1 extension byte

			
 
				-            if w in range(176,207):

			
 
				+            if ORD(w) in range(192,224):

			
 
				                 if ( (n+1 < L)

			
 
				-                 and (a[n+1] in range(128,175))     # valid UTF8 extension byte

			
 
				+                 and (ORD(a[n+1]) in range(128,192))     # valid UTF8 extension byte

			
 
				                 ):

			
 
				                     if ((n+2 == L)                  # w is last character

			
 
				-                     or (a[n+2] < 128)              # or next byte is an ASCII char

			
 
				-                     or (a[n+2] in range(176,271))  # or next byte is an UTF8 leading byte

			
 
				+                     or (ORD(a[n+2]) < 128)              # or next byte is an ASCII char

			
 
				+                     or (ORD(a[n+2]) in range(192,244))  # or next byte is an UTF8 leading byte

			
 
				                     ):

			
 
				                         return "utf_8"

			
 
				                 return default

			
 
				 

			
 
				             # UTF8 three byte sequence: leading byte + 2 extension bytes

			
 
				-            if w in range(208,239):

			
 
				+            if ORD(w) in range(224,240):

			
 
				                 if ( (n+2 < L)

			
 
				-                 and (a[n+1] in range(128,175))     # 2 valid UTF8 extension bytes

			
 
				-                 and (a[n+2] in range(128,175))

			
 
				+                 and (ORD(a[n+1]) in range(128,192))     # 2 valid UTF8 extension bytes

			
 
				+                 and (ORD(a[n+2]) in range(128,192))

			
 
				                 ):

			
 
				                     if ((n+3 == L)                  # w is last character

			
 
				-                     or (a[n+3] < 128)              # or next byte is ASCII char

			
 
				-                     or (a[n+3] in range(176,271))  # or next byte is UTF8 leading byte

			
 
				+                     or (ORD(a[n+3]) < 128)              # or next byte is ASCII char

			
 
				+                     or (ORD(a[n+3]) in range(192,244))  # or next byte is UTF8 leading byte

			
 
				                     ):

			
 
				                         return "utf_8"

			
 
				                 return default

			
 
				 

			
 
				             # UTF8 four byte sequence: leading byte + 3 extension bytes

			
 
				-            if w in range(240,271):

			
 
				+            if ORD(w) in range(240,244):

			
 
				                 if ( (n+3 < L)

			
 
				-                 and (a[n+1] in range(128,175))     # 3 valid UTF8 extension bytes

			
 
				-                 and (a[n+2] in range(128,175))

			
 
				-                 and (a[n+3] in range(128,175))

			
 
				+                 and (ORD(a[n+1]) in range(128,192))     # 3 valid UTF8 extension bytes

			
 
				+                 and (ORD(a[n+2]) in range(128,192))

			
 
				+                 and (ORD(a[n+3]) in range(128,192))

			
 
				                 ):

			
 
				                     if ((n+4 == L)                  # w is last character

			
 
				-                     or (a[n+4] < 128)              # or next byte is ASCII char

			
 
				-                     or (a[n+4] in range(176,271))  # or next byte is UTF8 leading byte

			
 
				+                     or (ORD(a[n+4]) < 128)              # or next byte is ASCII char

			
 
				+                     or (ORD(a[n+4]) in range(192,244))  # or next byte is UTF8 leading byte

			
 
				                     ):

			
 
				                         return "utf_8"

			
 
				                 return default

			
@@ -265,7 +279,10 @@ class DirectoryReader():
 
				 

			
 
				         # ----------------------------------------------------------------------

			
 
				         # Subroutine readtextfile

			
 
				-        # open as binary and try to guess the encoding:

			
 
				+        # open as binary and try to guess the encoding

			
 
				+        # attention:

			
 
				+        # - Phyton 3: "a" is a binary array

			
 
				+        # - Python 2: "a" is string array!

			
 
				         # ----------------------------------------------------------------------

			
 
				         f = open(filename, 'rb');

			
 
				         a = f.read();

			
@@ -312,16 +329,34 @@ class DirectoryReader():
 
				         text = text.replace("\r","\n")

			
 
				 

			
 
				         # debug:

			
 
				-        #print(filename+" - Coding found = "+coding+" len: "+str(len(text)))

			
 
				-        #f = open(filename+"."+coding,'wb')

			
 
				-        #f.write(text.encode(coding))   write in original coding

			
 
				-        # or:

			
 
				-        #f.write(text.encode("utf-8"))  write as UTF-8: same files in different encodings should result in identical files

			
 
				-        #f.close

			
 
				+        #print(filename+" - Coding found = "+coding+" len: "+str(len(text))+" / "+str(len(a)));

			
 
				+        #self.debugout(filename,text,coding)

			
 
				 

			
 
				         return text

			
 
				+

			
 
				         # end of readtextfile --------------------------------------------------

			
 
				 

			
 
				+    def readfile_org(self,filename):

			
 
				+        f = open(filename, 'rU');

			
 
				+        coding = f.encoding

			
 
				+        text = f.read();

			
 
				+        # getting along with the different string handling of python 2 and 3

			
 
				+        # trying to get along with different encodings to get the tests running

			
 
				+        # on windows and linux

			
 
				+        try:

			
 
				+            text = text.encode(f.encoding)

			
 
				+        except:

			
 
				+            pass

			
 
				+        try:

			
 
				+            text = text.decode('utf-8')

			
 
				+        except:

			
 
				+            pass

			
 
				+        f.close()

			
 
				+

			
 
				+        #self.debugout(filename,text,coding)

			
 
				+

			
 
				+        return text

			
 
				+

			
 
				     def run(self, plugin, directory):

			
 
				 

			
 
				         IS_TEST_MODE = False

			
@@ -346,6 +381,7 @@ class DirectoryReader():
 
				                         ts = time.time()

			
 
				 

			
 
				                         text = self.readtextfile(full_path)

			
 
				+                        #text = self.readfile_org(full_path)

			
 
				                         checksum = binascii.crc32(text.encode('utf8')) & 0xffffffff # to match python 3

			
 
				 

			
 
				                         db_loader = plugin.get_plugin('metrixpp.mpp.dbf').get_loader()