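' Requires a project reference to ADO 2.5 or later.
' Requires a project reference to DAO 3.6.
' ReadText_TextStream can only be used inside the Access environment.
' It also needs a separate table named 表2 with three fields:
'   表2: id (AutoNumber), xml (Memo), path (Text)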
' Reads the raw bytes of a text file into a Byte array with the native
' Open / Get statements, prints the byte count, and passes the bytes to
' AnalyseBytes for decoding.
'
' The file path is hard-coded below; swap the commented line to try another file.
' The whole file is read with a single Get (no chunked buffering), so memory
' use grows with the file size.
'
' Raises: any run-time error from FileLen / Open / Get is re-raised to the
'         caller after the file handle has been closed.
Function ReadText_Open()
    Dim strPath As String
    Dim tb() As Byte
    Dim lngFileNumber As Long
    Dim lngFileLen As Long
    Dim blnFileOpen As Boolean
    Dim lngErrNumber As Long
    Dim strErrSource As String
    Dim strErrText As String

    'strPath = "J:\MyTemp\ut\其他的UTF8文本.txt"
    strPath = "J:\MyProgram\DiskClerk\CatalogLibrary\MoveHD-03.xml"

    lngFileLen = FileLen(strPath)
    If lngFileLen = 0 Then
        ' ReDim tb(-1) is not a valid bound, so an empty file is reported
        ' and skipped instead of failing with "Subscript out of range".
        Debug.Print 0
        Exit Function
    End If

    lngFileNumber = FreeFile
    On Error GoTo CloseAndFail

    ' The Len clause is ignored in Binary mode, so it is not specified here.
    Open strPath For Binary Access Read Shared As #lngFileNumber
    blnFileOpen = True

    ReDim tb(lngFileLen - 1) As Byte
    Get #lngFileNumber, , tb
    Debug.Print UBound(tb) + 1

    Close #lngFileNumber
    blnFileOpen = False
    On Error GoTo 0

    ' An earlier version rebuilt a string from the byte pairs with ChrB here;
    ' it was disabled because it was far too slow on large files.
    AnalyseBytes tb
    Exit Function

CloseAndFail:
    ' Capture the error first: closing the file must not lose it.
    lngErrNumber = Err.Number
    strErrSource = Err.Source
    strErrText = Err.Description
    If blnFileOpen Then Close #lngFileNumber
    Err.Raise lngErrNumber, strErrSource, strErrText
End Function
' (Deliberately WRONG approach, kept as a demonstration.)
' Reads the text file through Scripting.FileSystemObject's TextStream, once
' into a String and once assigned straight into a Byte array, prints both
' lengths, stores the text and path as a new row in table 表2, and finally
' feeds the bytes to AnalyseBytes.
'
' Switch strPath between the 10-byte and the 13-byte sample file: the length
' printed at #1 does not equal the file length, which is the point of the demo.
'
' Only usable inside Access (uses CurrentDb) and requires table 表2 with the
' fields xml and path.
Function ReadText_TextStream()
    Dim strXml As String
    Dim strPath As String
    Dim tb() As Byte
    Dim fs As Object            ' Scripting.FileSystemObject (late bound)
    Dim ts As Object            ' TextStream returned by OpenTextFile
    Dim rst As DAO.Recordset

    Set fs = CreateObject("Scripting.FileSystemObject")

    'strPath = "J:\MyTemp\ut\10个字节的UTF8文本.txt"
    strPath = "J:\MyTemp\ut\13个字节的UTF8文本.txt"

    ' First pass: read the whole file as text (1 = ForReading).
    Set ts = fs.OpenTextFile(strPath, 1)
    strXml = ts.ReadAll
    ts.Close
    Debug.Print "xml内容", strXml

    ' Second pass: assign the text directly to the Byte array.
    Set ts = fs.OpenTextFile(strPath, 1)
    tb = ts.ReadAll
    '#1
    Debug.Print "fso textstream 直接读取转换为数组 与 读取字符串长度对比", UBound(tb) + 1, LenB(strXml)
    ts.Close
    Set ts = Nothing
    Set fs = Nothing

    ' Persist the text and its source path.
    Set rst = CurrentDb.OpenRecordset("select * from 表2")
    rst.AddNew
    rst("xml") = strXml
    rst("path") = strPath
    rst.Update
    rst.Close
    Set rst = Nothing

    AnalyseBytes tb
    Debug.Print "长度:", UBound(tb) + 1, LenB(strXml)
End Function
' Reads the raw bytes of a file through a binary ADODB.Stream, prints the
' stream size next to the resulting array length, and passes the bytes to
' AnalyseBytes for decoding.
'
' The file path is hard-coded below; swap the commented line to try another file.
'
' Raises: any run-time error from the stream is re-raised to the caller after
'         the stream has been closed.
Function ReadText_Stream()
    Dim s As New ADODB.Stream
    Dim strPath As String
    Dim b() As Byte
    Dim lngErrNumber As Long
    Dim strErrSource As String
    Dim strErrText As String

    'strPath = "J:\MyTemp\ut\其他的UTF8文本.txt"
    strPath = "J:\MyProgram\DiskClerk\CatalogLibrary\MoveHD-03.xml"

    On Error GoTo CloseAndFail

    s.Type = adTypeBinary
    s.Open
    s.LoadFromFile strPath

    If s.Size = 0 Then
        ' Read returns Null for an empty stream, which cannot be assigned
        ' to a Byte array; report the empty file and stop.
        Debug.Print "stream 与 数组 长度对比:", s.Size, 0
        s.Close
        Exit Function
    End If

    b = s.Read()
    Debug.Print "stream 与 数组 长度对比:", s.Size, UBound(b) + 1

    s.Close
    On Error GoTo 0

    AnalyseBytes b
    Exit Function

CloseAndFail:
    ' Capture the error first: closing the stream must not lose it.
    lngErrNumber = Err.Number
    strErrSource = Err.Source
    strErrText = Err.Description
    If s.State <> adStateClosed Then s.Close
    Err.Raise lngErrNumber, strErrSource, strErrText
End Function
' Decodes a UTF-8 byte array into VBA (UTF-16) text and prints the result to
' the Immediate window.
'
' bts : zero-based array of raw file bytes. It must be dimensioned.
'
' Output (Debug.Print only; the function returns nothing useful):
'   * whenever the current position is a multiple of 10000: the position, a
'     percentage, and the text decoded so far (the buffer is then cleared);
'   * if the data starts with the UTF-8 byte-order mark EF BB BF, each of the
'     three mark bytes in hex (they are not part of the text);
'   * at the end, whatever text is still buffered.
'
' Lead-byte ranges used by the decoder:
'     0 - 127   single-byte (ASCII) character
'   128 - 191   continuation byte
'   192 - 223   first byte of a 2-byte character (11 code bits)
'   224 - 239   first byte of a 3-byte character (16 code bits),
'               e.g. 11100110 10110001 10001001 -> 0110 110001 001001
'   240 - 247   first byte of a 4-byte character (21 code bits), emitted as
'               a UTF-16 surrogate pair
'
' Raises error 981 for a byte that cannot start a character (248 - 255), for
' a multi-byte character cut off by the end of the array, for a missing
' continuation byte, and for a code point above U+10FFFF. A continuation byte
' that appears where a lead byte is expected is skipped silently.
Public Function AnalyseBytes(ByRef bts() As Byte)
    Dim lngLast As Long
    Dim lngPos As Long
    Dim lngCode As Long
    Dim dblPercent As Double
    Dim blnHasBom As Boolean
    Dim strDecode As String

    lngLast = UBound(bts)

    ' Only treat the first three bytes as a mark when they really are one;
    ' otherwise they are ordinary text and must be decoded.
    If lngLast >= 2 Then
        blnHasBom = (bts(0) = &HEF And bts(1) = &HBB And bts(2) = &HBF)
    End If

    lngPos = 0
    Do While lngPos <= lngLast

        If lngPos Mod 10000 = 0 Then
            ' A one-byte array has lngLast = 0; avoid dividing by zero.
            If lngLast > 0 Then
                dblPercent = Round(lngPos / lngLast, 2) * 100
            Else
                dblPercent = 0
            End If
            Debug.Print lngPos, dblPercent & "%"
            Debug.Print strDecode
            strDecode = ""
        End If

        If blnHasBom And lngPos <= 2 Then
            Debug.Print "UTF-8文件的标记是3个字节:&HEF、&HBB 和 &HBF", Hex(bts(lngPos))
            lngPos = lngPos + 1
        Else
            Select Case bts(lngPos)
                Case 0 To 127
                    strDecode = strDecode & ChrW(bts(lngPos))
                    lngPos = lngPos + 1

                Case 128 To 191
                    ' Stray continuation byte: ignored.
                    lngPos = lngPos + 1

                Case 192 To 223
                    lngCode = (CLng(bts(lngPos)) - 192) * 64 _
                            + Utf8TailBits(bts, lngPos + 1)
                    strDecode = strDecode & ChrW(lngCode)
                    DoEvents
                    lngPos = lngPos + 2

                Case 224 To 239
                    lngCode = (CLng(bts(lngPos)) - 224) * 4096 _
                            + Utf8TailBits(bts, lngPos + 1) * 64 _
                            + Utf8TailBits(bts, lngPos + 2)
                    strDecode = strDecode & ChrW(lngCode)
                    DoEvents
                    lngPos = lngPos + 3

                Case 240 To 247
                    lngCode = (CLng(bts(lngPos)) - 240) * 262144 _
                            + Utf8TailBits(bts, lngPos + 1) * 4096 _
                            + Utf8TailBits(bts, lngPos + 2) * 64 _
                            + Utf8TailBits(bts, lngPos + 3)
                    If lngCode > 1114111 Then
                        Err.Raise 981, "", "bytes(" & lngPos & "): code point " & lngCode & " is above U+10FFFF"
                    ElseIf lngCode < 65536 Then
                        ' Fits in one UTF-16 unit.
                        strDecode = strDecode & ChrW(lngCode)
                    Else
                        ' ChrW cannot exceed 65535, so split into a surrogate pair.
                        lngCode = lngCode - 65536
                        strDecode = strDecode & ChrW(55296 + lngCode \ 1024) _
                                              & ChrW(56320 + (lngCode Mod 1024))
                    End If
                    DoEvents
                    lngPos = lngPos + 4

                Case Else
                    Err.Raise 981, "", "bytes(" & lngPos & ") = " & bts(lngPos) & ", " & N10toC62(192, 2)
            End Select
        End If
    Loop

    Debug.Print strDecode
End Function

' Returns the 6 payload bits of the UTF-8 continuation byte at lngIndex.
' Raises error 981 when lngIndex is past the end of bts or the byte is not in
' the continuation range 128 - 191.
Private Function Utf8TailBits(ByRef bts() As Byte, ByVal lngIndex As Long) As Long
    If lngIndex > UBound(bts) Then
        Err.Raise 981, "", "bytes(" & lngIndex & "): UTF-8 sequence is cut off by the end of the data"
    End If
    If bts(lngIndex) < 128 Or bts(lngIndex) > 191 Then
        Err.Raise 981, "", "bytes(" & lngIndex & ") = " & bts(lngIndex) & ": expected a UTF-8 continuation byte"
    End If
    Utf8TailBits = CLng(bts(lngIndex)) - 128
End Function
' Scratch routine used to inspect intermediate values while working out the
' UTF-8 decoder. It prints the decimal boundaries of the UTF-8 byte classes,
' converting each binary literal with C62ToN10.
'
' The commented-out lines are earlier experiments kept for reference.
Function Explain()
    'Debug.Print Asc("新")
    'Debug.Print Asc("建")
    'Debug.Print Hex(24), Hex(149), Hex(14), Hex(99)
    'Debug.Print &HEF
    'Debug.Print N10toC62(61, 2), N10toC62(119, 2)
    'Debug.Print "UTF-8文件的标记是3个字节:&HEF、&HBB 和 &HBF", Hex(C62ToN10("11101111", 2)), Hex(C62ToN10("10111011", 2)), Hex(C62ToN10("10111111", 2))
    'Debug.Print ChrW(C62ToN10(Right("11100110", 4) & Right("10110001", 6) & Right("10001001", 6), 2)), AscW("汉")
    'Debug.Print ChrW(C62ToN10(Right("1100001", 5) & Right("1100001", 6) & Right("10001001", 6), 2)), AscW("汉")

    ' Note the last argument below: converting the hex value to binary gives
    ' 111001110001011, which is only 15 bits; it should be 16 bits, so a
    ' leading 0 has to be added.
    'Debug.Print "王", AscW("王"), Hex(AscW("王")), N10toC62(&H73, 2) & N10toC62(&H8B, 2), "0111 001110 001011"

    ' 3-byte character: lead byte range, then continuation byte range.
    Debug.Print "utf8 3字节汉字首字节最低值", C62ToN10("11100000", 2)
    Debug.Print "utf8 3字节汉字首字节最高值", C62ToN10("11101111", 2)
    Debug.Print "utf8 3字节汉字后续节最低值", C62ToN10("10000000", 2)
    Debug.Print "utf8 3字节汉字后续节最高值", C62ToN10("10111111", 2)

    ' 2-byte character: lead byte range.
    Debug.Print "utf8 2字节字符首字节最低值", C62ToN10("11000000", 2)
    Debug.Print "utf8 2字节字符首字节最高值", C62ToN10("11011111", 2)

    ' ASCII range. The second label used to repeat "lowest value" although
    ' 01111111 (127) is the highest ASCII value.
    Debug.Print "utf8 ASCII码最低值", C62ToN10("00000000", 2)
    Debug.Print "utf8 ASCII码最高值", C62ToN10("01111111", 2)

    'Debug.Print "utf8 汉字首字节值", C62ToN10("11100001", 2), C62ToN10("11100001", 2) - C62ToN10("11100000", 2), C62ToN10("1000000000000", 2), C62ToN10("1000000", 2), ChrW(C62ToN10("0111001110001011", 2))
    'Debug.Print ChrW((C62ToN10("11100110", 2) - 224) * 4096 + (C62ToN10("10110001", 2) - 128) * 64 + C62ToN10("10001001", 2) - 128), AscW("汉")
End Function
' Converts a string of digits in base 2 - 62 to its decimal value.
'
' strA : the digits, most significant first. Digit symbols are
'        0-9 (values 0-9), A-Z (values 10-35) and a-z (values 36-61).
'        For bases up to 36 the input is upper-cased first, so letter case
'        does not matter there; above base 36 the input is case-sensitive.
' bt   : the base. If omitted, or outside 2 - 62, base 16 is used.
'
' Returns the value as a Double; an empty string yields 0.
'
' Raises error 5 (Invalid procedure call or argument) when strA contains a
' character that is not a digit symbol or whose value is not below the base.
'
' For bases 8 and 16 the built-in CLng("&O71") / CLng("&Hf1") can be used instead.
Function C62ToN10(ByVal strA As String, Optional ByVal bt As Byte) As Double
    Dim lngLength As Long
    Dim lngPos As Long
    Dim lngCharCode As Long
    Dim lngDigit As Long
    Dim dblTotal As Double

    ' Fall back to hexadecimal for a missing or unsupported base.
    If bt < 2 Or bt > 62 Then bt = 16

    ' Up to base 36 only one letter case is needed, so accept either.
    If bt <= 36 Then strA = UCase$(strA)

    lngLength = Len(strA)
    For lngPos = 1 To lngLength
        lngCharCode = Asc(Mid$(strA, lngPos, 1))

        Select Case lngCharCode
            Case 48 To 57       ' "0" - "9"
                lngDigit = lngCharCode - 48
            Case 65 To 90       ' "A" - "Z"
                lngDigit = lngCharCode - 55
            Case 97 To 122      ' "a" - "z"
                lngDigit = lngCharCode - 61
            Case Else
                lngDigit = -1
        End Select

        ' Reject bad input instead of silently reusing the previous digit.
        If lngDigit < 0 Or lngDigit >= bt Then
            Err.Raise 5, "C62ToN10", "'" & Mid$(strA, lngPos, 1) & "' at position " & lngPos & " is not a valid base-" & bt & " digit"
        End If

        ' Weight of this digit is base ^ (digits remaining to its right).
        dblTotal = dblTotal + lngDigit * bt ^ (lngLength - lngPos)
    Next

    C62ToN10 = dblTotal
End Function
' Converts a non-negative decimal value to a string of digits in base 2 - 62.
'
' b  : the value to convert (0 or greater).
' bt : the base. If omitted, or outside 2 - 62, base 16 is used.
'
' Returns the digits, most significant first, without leading zeros
' (0 yields "0"). Digit symbols are 0-9 (values 0-9), A-Z (values 10-35) and
' a-z (values 36-61), so the result is case-sensitive.
'
' Raises error 5 (Invalid procedure call or argument) when b is negative.
'
' For bases 8 and 16 the built-in Oct / Hex functions can be used instead.
Function N10toC62(ByVal b As Long, Optional ByVal bt As Byte) As String
    Dim lngDigit As Long
    Dim strSymbol As String
    Dim strResult As String

    ' Fall back to hexadecimal for a missing or unsupported base.
    If bt < 2 Or bt > 62 Then bt = 16

    ' A negative value has negative remainders that map to no digit symbol.
    If b < 0 Then
        Err.Raise 5, "N10toC62", "Cannot convert the negative value " & b
    End If

    ' Peel off the least significant digit each round and prepend it.
    Do
        lngDigit = b Mod bt

        Select Case lngDigit
            Case 0 To 9
                strSymbol = Chr$(lngDigit + 48)     ' "0" - "9"
            Case 10 To 35
                strSymbol = Chr$(lngDigit + 55)     ' "A" - "Z"
            Case Else
                strSymbol = Chr$(lngDigit + 61)     ' "a" - "z"
        End Select

        strResult = strSymbol & strResult
        b = b \ bt
    Loop Until b = 0

    N10toC62 = strResult
End Function