2 years ago · 5e948c4ee9
--- a/metrixpp/ext/std/tools/collect.py
+++ b/metrixpp/ext/std/tools/collect.py
@@ -132,9 +132,58 @@ class DirectoryReader():
 
																     def readtextfile(self,filename):

															
 
																         """ Read a text file and try to detect the coding

															
 
																-        """

															
 
																-        # Subroutine - Check for UTF8:

															
 
																+            Since we examine program code text files we can assume the following:

															
 
																+            - There are no NUL characters, i.e. no 0x00 sequences of 1, 2 or 4

															
 
																+              bytes, starting on 1, 2 or 4 byte boundaries (depending on

															
 
																+              1, 2 or 4 byte coding)

															
 
																+            - There should be at least 1 line terminated with an end of line

															
 
																+              character, i.e. \n or \r of the respective length (1,2 or 4 byte)

															
 
																+            - Program code consists of only ASCII chars, i.e. code < 128

															
 
																+            - Non ASCII chars should only appear in string literals and comments

															
 
																+

															
 
																+            Thus, especially in the case of an 8 bit coding, it does not matter

															
 
																+            which code page to use: Metric analysis is done on program code

															
 
																+            which is pure ASCII; string literals and comments are only recognized

															
 
																+            as such but not interpreted, so it doesn't matter if they contain

															
 
																+            non-ASCII chars whichever code page is used.

															
 
																+

															
 
																+            Note the decoder's different behavior for the "utf-nn" identifiers:

															
 
																+            - .decode("utf-32") / .decode("utf-16"):       preceding BOM is skipped

															
 
																+            - with suffix "..-be" or "..-le" respectively: preceding BOM is preserved

															
 
																+            but

															
 
																+            - .decode("utf-8"):     preceding BOM is preserved

															
 
																+            - .decode("utf-8-sig"): preceding BOM is skipped

															
 
																+        """

															
 
																+        # Methods to check for various UTF variants without BOM:

															
 
																+        # Since UTF16/32 codings are recommended to use a BOM these methods

															
 
																+        # shouldn't be necessary but may be useful in certain cases.

															
 
																+        def checkforUTF32_BE(a):

															
 
																+            if ( (len(a) % 4) != 0 ): return False

															
 
																+            n = a.find(b'\x00\x00\x00\n')

															
 
																+            if n < 0:

															
 
																+                n = a.find(b'\x00\x00\x00\r')

															
 
																+            return (n >= 0) and ((n % 4) == 0)

															
 
																+        def checkforUTF32_LE(a):

															
 
																+            if ( (len(a) % 4) != 0 ): return False

															
 
																+            n = a.find(b'\n\x00\x00\x00')

															
 
																+            if n < 0:

															
 
																+                n = a.find(b'\r\x00\x00\x00')

															
 
																+            return (n >= 0) and ((n % 4) == 0)

															
 
																+        def checkforUTF16_BE(a):

															
 
																+            if ( (len(a) % 2) != 0 ): return False

															
 
																+            n = a.find(b'\x00\n')

															
 
																+            if n < 0:

															
 
																+                n = a.find(b'\x00\r')

															
 
																+            return (n >= 0) and ((n % 2) == 0)

															
 
																+        def checkforUTF16_LE(a):

															
 
																+            if ( (len(a) % 2) != 0 ): return False

															
 
																+            n = a.find(b'\n\x00')

															
 
																+            if n < 0:

															
 
																+                n = a.find(b'\r\x00')

															
 
																+            return (n >= 0) and ((n % 2) == 0)

															
 
																+

															
 
																+        # Method to check for UTF8 without BOM:

															
 
																         # "a" is the textfile represented as a simple byte array!

															
 
																         # Find first char with code > 127:

															
 
																         #

															
@@ -152,7 +201,8 @@ class DirectoryReader():
 
																         #   If a valid UTF8 sequence is found but in fact the text is some sort

															
 
																         #   of 8 bit OEM coding this may be coincidentally a sequence of 8 bit

															
 
																         #   OEM chars. This indeed seems very unlikely but may happen...

															
 
																-        #   Otherwise the whole text has to be examined for UTF8 sequences.

															
 
																+        #   Even if the whole text is examined for UTF8 sequences, every

															
 
																+        #   valid UTF8 sequence found may also be a sequence of OEM chars!

															
 
																         #

															
 
																         # 3 Code is not a valid UTF8 leading byte: range(128,175) or range(272,255)

															
 
																         #   In this case coding is some sort of 8 bit OEM coding. Since we don't

															
@@ -213,30 +263,37 @@ class DirectoryReader():
 
																             return default;

															
 
																           # end of checkforUTF8 ------------------------------------------------

															
 
																+        # ----------------------------------------------------------------------

															
 
																         # Subroutine readtextfile

															
 
																         # open as binary and try to guess the encoding:

															
 
																+        # ----------------------------------------------------------------------

															
 
																         f = open(filename, 'rb');

															
 
																         a = f.read();

															
 
																         f.close()

															
 
																         # check for codings with BOM:

															
 
																-        if a.startswith(b'\xff\xfe'):

															
 
																-            coding = "utf_16_le"

															
 
																-        elif a.startswith(b'\xfe\xff'):

															
 
																-            coding = "utf_16_be"

															
 
																-        elif a.startswith(b'\xff\xfe\x00\x00'):

															
 
																-            coding = "utf_32_le"

															
 
																-        elif a.startswith(b'\x00\x00\xfe\xff'):

															
 
																-            coding = "utf_32_be"

															
 
																+        # Consider the order: Check for UTF32 first!

															
 
																+        if  (a.startswith(b'\xff\xfe\x00\x00')

															
 
																+          or a.startswith(b'\x00\x00\xfe\xff')):

															
 
																+            coding = "utf_32"       # no suffix _be/_le --> decoder skips the BOM

															
 
																+        elif (a.startswith(b'\xff\xfe')

															
 
																+           or a.startswith(b'\xfe\xff')):

															
 
																+            coding = "utf_16"       # no suffix _be/_le --> decoder skips the BOM

															
 
																         elif a.startswith(b'\xef\xbb\xbf'):

															
 
																             coding = "utf_8_sig"

															
 
																         # elif: there are some other codings with BOM - feel free to add them here

															
 
																-        # elif: check for UTF variants without BOM:

															
 
																-        #       at this point one may try to determine UTF16 or UTF32 codings

															
 
																-        #       without a BOM but this should not happen since for these codings

															
 
																-        #       a BOM is recommended.

															
 
																+        # check for UTF variants without BOM:

															
 
																+        # Consider the order: Check for UTF32 first!

															
 
																+        elif checkforUTF32_BE(a):

															
 
																+            coding = "utf_32_be"

															
 
																+        elif checkforUTF32_LE(a):

															
 
																+            coding = "utf_32_le"

															
 
																+        elif checkforUTF16_BE(a):

															
 
																+            coding = "utf_16_be"

															
 
																+        elif checkforUTF16_LE(a):

															
 
																+            coding = "utf_16_le"

															
 
																         # So finally we only have to look for UTF8 without BOM:

															
 
																         else:

															
@@ -257,7 +314,9 @@ class DirectoryReader():
 
																         # debug:

															
 
																         #print(filename+" - Coding found = "+coding+" len: "+str(len(text)))

															
 
																         #f = open(filename+"."+coding,'wb')

															
 
																-        #f.write(text.encode(coding))

															
 
																+        #f.write(text.encode(coding))   write in original coding

															
 
																+        # or:

															
 
																+        #f.write(text.encode("utf-8"))  write as UTF-8: same files in different encodings should result in identical files

															
 
																         #f.close

															
 
																         return text

															
@@ -285,6 +344,7 @@ class DirectoryReader():
 
																                     else:

															
 
																                         logging.info("Processing: " + norm_path)

															
 
																                         ts = time.time()

															
 
																+

															
 
																                         text = self.readtextfile(full_path)

															
 
																                         checksum = binascii.crc32(text.encode('utf8')) & 0xffffffff # to match python 3