|
@@ -137,25 +137,29 @@ class DirectoryReader():
|
|
|
|
|
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
|
|
|
-
|
|
|
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
-
|
|
|
+
|
|
|
def checkforUTF8(a,default="latin_1"):
|
|
|
L = len(a)
|
|
|
n = 0
|
|
@@ -166,42 +170,46 @@ class DirectoryReader():
|
|
|
|
|
|
w = a[n]
|
|
|
|
|
|
- if w in range(176,207):
|
|
|
+
|
|
|
+ if w in range(176,207):
|
|
|
if ( (n+1 < L)
|
|
|
and (a[n+1] in range(128,175))
|
|
|
):
|
|
|
- if ((n+2 == L)
|
|
|
+ if ((n+2 == L)
|
|
|
or (a[n+2] < 128)
|
|
|
or (a[n+2] in range(176,271))
|
|
|
):
|
|
|
return "utf_8"
|
|
|
return default
|
|
|
|
|
|
- if w in range(208,239):
|
|
|
+
|
|
|
+ if w in range(208,239):
|
|
|
if ( (n+2 < L)
|
|
|
and (a[n+1] in range(128,175))
|
|
|
and (a[n+2] in range(128,175))
|
|
|
):
|
|
|
- if ((n+3 == L)
|
|
|
+ if ((n+3 == L)
|
|
|
or (a[n+3] < 128)
|
|
|
or (a[n+3] in range(176,271))
|
|
|
):
|
|
|
return "utf_8"
|
|
|
return default
|
|
|
|
|
|
- if w in range(240,271):
|
|
|
+
|
|
|
+ if w in range(240,271):
|
|
|
if ( (n+3 < L)
|
|
|
and (a[n+1] in range(128,175))
|
|
|
and (a[n+2] in range(128,175))
|
|
|
and (a[n+3] in range(128,175))
|
|
|
):
|
|
|
- if ((n+4 == L)
|
|
|
+ if ((n+4 == L)
|
|
|
or (a[n+4] < 128)
|
|
|
or (a[n+4] in range(176,271))
|
|
|
):
|
|
|
return "utf_8"
|
|
|
return default
|
|
|
|
|
|
+
|
|
|
return default;
|
|
|
|
|
|
|
|
@@ -222,11 +230,14 @@ class DirectoryReader():
|
|
|
coding = "utf_32_be"
|
|
|
elif a.startswith(b'\xef\xbb\xbf'):
|
|
|
coding = "utf_8_sig"
|
|
|
+
|
|
|
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
|
|
|
else:
|
|
|
coding = checkforUTF8(a)
|
|
@@ -235,7 +246,7 @@ class DirectoryReader():
|
|
|
|
|
|
|
|
|
|
|
|
-
|
|
|
+
|
|
|
text = a.decode(coding,'replace')
|
|
|
|
|
|
|
|
@@ -245,7 +256,7 @@ class DirectoryReader():
|
|
|
|
|
|
|
|
|
|
|
|
-
|
|
|
+
|
|
|
|
|
|
|
|
|
|