注册 登录  
 加关注
查看详情
   显示下一条  |  关闭
温馨提示!由于新浪微博认证机制调整,您的新浪微博帐号绑定已过期,请重新绑定!立即重新绑定新浪微博》  |  关闭

银河军团大本营

光荣的军团,永远的丰碑 <坚持原创>

 
 
 

日志

 
 

文本编码的智能识别(续) - C#版本代码  

2013-02-18 21:02:25|  分类: 软件就是生活 |  标签: |举报 |字号 订阅

  下载LOFTER 我的照片书  |

//C#版本核心代码 (将其保存为SmartLoadCS.cs) VS2005测试通过,其他版本未测试

/*******************************************************************************
* 智能识别文本编码(C#版) *
* *
* 功能: 本单元用于智能识别文本文件编码,包括简繁体文本的识别,VS2005测试通 *
* 过 *
* 实现: 2013.1.31 ~ 2013.2.18 实现Delphi版到C#的转换 *
* Blog: dreamisx.blog.163.com *
* EMail: iamdream%yeah.net (% -> @) *
*******************************************************************************/


using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Runtime.InteropServices;

/*---------------------------- Character Encoding -----------------------------

1. UTF-8
Ascii Chars: 00-7F // 1 Bytes = 0xxxxxxx
Multi Bytes: C0-DF + 80-BF // 2 Bytes = 110xxxxx 10xxxxxx
E0-EF + 80-BF + 80-BF // 3 Bytes = 1110xxxx 10xxxxxx 10xxxxxx
F0-F7 + 80-BF + 80-BF + 80-BF // 4 Bytes = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

2. Unicode 16 (UCS2) [as UTF-16 for 96.9%]
2 Bytes Characters: 0000-FFFF

3. UTF-16
0000-D7FF = 0000-D7FF // 2 Bytes
E000-FFFF = E000-FFFF // 2 Bytes
10000-10FFFF = D800-DBFF + DC00-DFFF // 4 Bytes (high surrogate + low surrogate)
D800-DFFF // Surrogate

4. UTF-32 = 32 bit unsigned integer of character

5. GB2312-80
Ascii Chars: 00-7F
Simplified Chinese Chars: A1-F7 + A1-FE
Note: A9 + A4-EF ==> box-drawing (table line) characters

6. Big5
Ascii Chars: 00-7F
Traditional Chinese Chars: A1-F9 + 40-7E
A1-F9 + A1-FE

7. GBK
Ascii Chars: 00-7F
Chinese Chars: 81-FE + 40-7E
81-FE + 80-FE

8. GB18030-2000
Ascii Chars: 00-7F
Chinese Chars (2 Bytes): 81-FE + 40-7E
81-FE + 80-FE
Chinese Chars (4 Bytes): 81-FE + 30-39 + 81-FE + 30-39



null GBK:
A1.40-A0
...
A7.40-A0
AA.A1-FE
...
AF.A1-FE
F8.A1-FE
...
FE.A1-FE

null Big5:
C6.A1-FE
C7.40-FE
C8.40-FE
XX.7F-A0

-----------------------------------------------------------------------------*/

namespace SmartLoad
{
public static class SmartLoadCS
{
// Mapping flags passed as dwMapFlags to LCMapStringA by Big52Gb
// (LCMAP_SIMPLIFIED_CHINESE) and Gb2Big5 (LCMAP_TRADITIONAL_CHINESE).
internal const int LCMAP_SIMPLIFIED_CHINESE = 0x02000000;
internal const int LCMAP_TRADITIONAL_CHINESE = 0x04000000;

// Native kernel32.dll entry points used by the conversion helpers below.

// ANSI (byte-oriented) string mapping; source and destination are raw byte arrays.
// The int result is the native function's return value.
[DllImport("kernel32.dll", EntryPoint = "LCMapStringA")]
static extern int LCMapStringA(uint Locale, uint dwMapFlags,
byte[] lpSrcStr, int cchSrc, byte[] lpDestStr, int cchDest);

// Supplies the Locale argument for the LCMapStringA calls in this file.
[DllImport("kernel32.dll")]
static extern uint GetUserDefaultLCID();

// Multi-byte (code page) bytes -> UTF-16 text. The destination is marshalled as an
// [Out] LPWStr StringBuilder, so the caller must size its capacity before the call.
[DllImport("kernel32.dll")]
static extern int MultiByteToWideChar(uint CodePage, uint dwFlags,
[MarshalAs(UnmanagedType.LPArray)] byte[] lpMultiByteStr,
int cbMultiByte, [Out, MarshalAs(UnmanagedType.LPWStr)] StringBuilder lpWideCharStr, int cchWideChar);

// UTF-16 text -> multi-byte (code page) bytes written into a caller-allocated array.
// This file always passes IntPtr.Zero for lpDefaultChar and lpUsedDefaultChar.
[DllImport("kernel32.dll")]
static extern int WideCharToMultiByte(uint CodePage, uint dwFlags,
[MarshalAs(UnmanagedType.LPWStr)] string lpWideCharStr, int cchWideChar,
[MarshalAs(UnmanagedType.LPArray)] Byte[] lpMultiByteStr, int cbMultiByte, IntPtr lpDefaultChar,
IntPtr lpUsedDefaultChar);

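/*********************************************************************
* GB & Big5 Conversion
* The methods below appear to convert Simplified Chinese text to
* Traditional Chinese and produce Big5-encoded output correctly when run
* on a Simplified Chinese system, but it is unknown whether they work
* correctly on a Traditional Chinese system. 2013.1.31~2013.2.18
*********************************************************************/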

/// <summary>
/// Decodes code-page bytes into a UTF-16 string through MultiByteToWideChar.
/// </summary>
/// <param name="source">Bytes encoded in <paramref name="CodePage"/>. Decoding stops at the
/// first zero byte, or at the end of the array if there is none.</param>
/// <param name="CodePage">Windows code page identifier of <paramref name="source"/>
/// (this file passes 936 and 950).</param>
/// <returns>The decoded text, or an empty string when the native call reports failure.</returns>
private static string UnicodeEncode(byte[] source, uint CodePage)
{
    // cbMultiByte = -1 makes the API scan for a terminating zero byte. A managed byte[]
    // carries no terminator, so pass a copy with one appended instead of letting the
    // native code read past the end of the array.
    byte[] terminated = new byte[source.Length + 1];
    Array.Copy(source, terminated, source.Length);

    // Worst case is one UTF-16 unit per input byte, plus the terminator the API writes
    // when cbMultiByte is -1.
    StringBuilder decoded = new StringBuilder(terminated.Length);
    int written = MultiByteToWideChar(CodePage, 0, terminated, -1, decoded, decoded.Capacity);
    if (written <= 0)
    {
        // 0 signals failure; the buffer content is not meaningful in that case.
        return string.Empty;
    }
    return decoded.ToString();
}

/// <summary>
/// Encodes a UTF-16 string into code-page bytes through WideCharToMultiByte.
/// </summary>
/// <param name="source">Text to encode; it is passed as a zero-terminated string (cchWideChar = -1).</param>
/// <param name="CodePage">Target Windows code page identifier (this file passes 936 and 950).</param>
/// <returns>The encoded bytes without the trailing terminator, or an empty array when the
/// native call reports failure.</returns>
private static byte[] UnicodeDecode(string source, uint CodePage)
{
    // Two bytes per UTF-16 unit plus one terminator byte.
    // NOTE(review): enough for the double-byte code pages used in this file; a code page
    // with longer sequences would need a larger buffer - confirm before reusing.
    byte[] buffer = new byte[source.Length * 2 + 1];
    int written = WideCharToMultiByte(CodePage, 0, source, -1, buffer, buffer.Length, IntPtr.Zero, IntPtr.Zero);
    if (written <= 0)
    {
        // 0 signals failure; without this guard the size below would become -1 and throw.
        return new byte[0];
    }

    // With cchWideChar = -1 the reported count includes the terminator; drop it.
    int payload = written - 1;
    byte[] result = new byte[payload];
    Array.Copy(buffer, result, payload);
    return result;
}


/// <summary>
/// Converts GB (code page 936) bytes to Big5 (code page 950) bytes: first maps the
/// characters with LCMAP_TRADITIONAL_CHINESE, then transcodes 936 -> UTF-16 -> 950.
/// </summary>
/// <param name="source">Text bytes treated as code page 936.</param>
/// <returns>The converted bytes; an empty array for empty input.</returns>
public static byte[] Gb2Big5(byte[] source)
{
    if (source.Length == 0)
    {
        // Nothing to convert; also avoids a native call with cchSrc = 0.
        return new byte[0];
    }

    byte[] mapped = new byte[source.Length];
    int produced = LCMapStringA(GetUserDefaultLCID(), LCMAP_TRADITIONAL_CHINESE, source, source.Length, mapped, mapped.Length);
    if (produced <= 0)
    {
        // The mapping failed and left the destination zero-filled. Transcode the
        // unmapped input rather than silently producing empty output.
        mapped = source;
    }
    return UnicodeDecode(UnicodeEncode(mapped, 936), 950);
}

/// <summary>
/// Converts Big5 (code page 950) bytes to GB (code page 936) bytes: first transcodes
/// 950 -> UTF-16 -> 936, then maps the characters with LCMAP_SIMPLIFIED_CHINESE.
/// </summary>
/// <param name="source">Text bytes treated as code page 950.</param>
/// <returns>The converted bytes. When the character mapping fails, the transcoded but
/// unmapped bytes are returned.</returns>
public static byte[] Big52Gb(byte[] source)
{
    byte[] transcoded = UnicodeDecode(UnicodeEncode(source, 950), 936);
    if (transcoded.Length == 0)
    {
        // Nothing to map; also avoids a native call with cchSrc = 0.
        return transcoded;
    }

    byte[] mapped = new byte[transcoded.Length];
    int produced = LCMapStringA(GetUserDefaultLCID(), LCMAP_SIMPLIFIED_CHINESE, transcoded, transcoded.Length, mapped, mapped.Length);
    if (produced <= 0)
    {
        // The mapping failed and left the destination zero-filled; keep the transcoded text.
        return transcoded;
    }
    return mapped;
}

/*********************************************************************
* Smart Load
*********************************************************************/

/// <summary>
/// Reports whether the calling thread's current culture is a Simplified Chinese culture.
/// </summary>
/// <returns>
/// true for zh-CN, zh-SG and the neutral names zh-Hans / zh-CHS (including
/// "zh-Hans-*" names); otherwise false.
/// </returns>
public static bool SysIsSimplifiedChinese()
{
    // Despite the "Sys" prefix this inspects the current thread's culture name.
    string name = System.Threading.Thread.CurrentThread.CurrentCulture.Name;
    string[] simplifiedNames = new string[] { "zh-CN", "zh-SG", "zh-Hans", "zh-CHS" };
    foreach (string candidate in simplifiedNames)
    {
        if (string.Equals(name, candidate, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }
    return name.StartsWith("zh-Hans-", StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Reports whether the calling thread's current culture is a Traditional Chinese culture.
/// </summary>
/// <returns>
/// true for zh-TW, zh-HK, zh-MO and the neutral names zh-Hant / zh-CHT (including
/// "zh-Hant-*" names); otherwise false.
/// </returns>
public static bool SysIsTraditionalChinese()
{
    // Despite the "Sys" prefix this inspects the current thread's culture name.
    string name = System.Threading.Thread.CurrentThread.CurrentCulture.Name;
    string[] traditionalNames = new string[] { "zh-TW", "zh-HK", "zh-MO", "zh-Hant", "zh-CHT" };
    foreach (string candidate in traditionalNames)
    {
        if (string.Equals(name, candidate, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }
    return name.StartsWith("zh-Hant-", StringComparison.OrdinalIgnoreCase);
}

/*-------------------------------------------------------------------*/

// Upper bound, in bytes, on the sample that GuessCharEncoding(Stream, bool) reads.
public static int SamplingSize = 4096;
// When true, MyCompareChineseStr divides the difference count by the value of
// CountChineseChars(s1) instead of by s1.Length.
public static bool ExactCompare = false;

// Detection results. Within this file the Utf32_* values are only produced by the BOM
// checks in SmartLoadTextFileToStream; the byte-scoring GuessCharEncoding never returns them.
public enum CharEncoding {Ansi, Binary, Utf_8, Ucs2_LE, Ucs2_BE, Utf32_LE, Utf32_BE, GB, Big5};

/*-------------------------------------------------------------------*/

/// <summary>
/// Counts the bytes that belong to double-byte characters in <paramref name="buf"/>.
/// Every byte greater than 0x80 found at a character boundary is taken as a lead byte
/// and contributes 2 to the total; the byte after it is skipped.
/// </summary>
/// <param name="buf">Bytes to scan.</param>
/// <returns>Twice the number of lead bytes encountered.</returns>
private static int CountChineseChars(byte[] buf)
{
    int total = 0;
    for (int pos = 0; pos < buf.Length; )
    {
        bool isLeadByte = buf[pos] > 0x80;
        if (!isLeadByte)
        {
            pos++;
            continue;
        }
        // Lead byte: account for the pair and jump over the trail byte.
        total += 2;
        pos += 2;
    }
    return total;
}

/// <summary>
/// Fuzzy byte-wise comparison of an original buffer against its round-trip conversion.
/// </summary>
/// <param name="s1">Original bytes.</param>
/// <param name="s2">Bytes to compare against <paramref name="s1"/>.</param>
/// <returns>
/// true when the number of differing positions is at most 6% of the comparison length
/// (s1.Length, or CountChineseChars(s1) when ExactCompare is set).
/// </returns>
private static bool MyCompareChineseStr(byte[] s1, byte[] s2)
{
    int mismatches = 0;
    int a = 0;
    int b = 0;
    while (a < s1.Length && b < s2.Length)
    {
        if (s1[a] != s2[b])
        {
            // After two conversions a character may have collapsed into a single '?',
            // shifting one side by a byte. If two bytes of look-ahead exist on both sides,
            // try to realign by skipping one byte on whichever side makes the next two match.
            bool canLookAhead = a + 2 < s1.Length && b + 2 < s2.Length;
            if (canLookAhead && s1[a + 1] == s2[b] && s1[a + 2] == s2[b + 1])
            {
                a++;
            }
            else if (canLookAhead && s1[a] == s2[b + 1] && s1[a + 1] == s2[b + 2])
            {
                b++;
            }
            else
            {
                mismatches++;
            }
        }
        a++;
        b++;
    }

    int denominator;
    if (ExactCompare)
    {
        denominator = CountChineseChars(s1);
    }
    else
    {
        denominator = s1.Length;
    }
    int percent = mismatches * 100 / Math.Max(1, denominator);
    return percent <= 6; // different <= 6%
}

/*-------------------------------------------------------------------*/

/// <summary>
/// Decides between GB and Big5 by converting <paramref name="source"/> to the other
/// encoding and back, then checking how closely the round trip matches the input.
/// </summary>
/// <param name="source">Sample bytes containing Chinese text.</param>
/// <returns>
/// GB or Big5 when SysIsSimplifiedChinese() or SysIsTraditionalChinese() is true;
/// Ansi when neither is.
/// </returns>
public static CharEncoding TryToDistinguishGBOrBig5(byte[] source)
{
    if (SysIsSimplifiedChinese())
    {
        // Assume GB: GB -> Big5 -> GB should reproduce the input.
        byte[] roundTrip = Big52Gb(Gb2Big5(source));
        if (MyCompareChineseStr(source, roundTrip))
        {
            return CharEncoding.GB;
        }
        return CharEncoding.Big5;
    }

    if (SysIsTraditionalChinese())
    {
        // Assume Big5: Big5 -> GB -> Big5 should reproduce the input.
        byte[] roundTrip = Gb2Big5(Big52Gb(source));
        if (MyCompareChineseStr(source, roundTrip))
        {
            return CharEncoding.Big5;
        }
        return CharEncoding.GB;
    }

    return CharEncoding.Ansi;
}

/// <summary>
/// Tests whether the three bytes starting at <paramref name="index"/> have the shape of a
/// 3-byte UTF-8 sequence: a lead byte E0-EF followed by two bytes in 80-BF.
/// </summary>
/// <param name="buf">Buffer to inspect.</param>
/// <param name="len">Number of usable bytes in <paramref name="buf"/>.</param>
/// <param name="index">Position of the candidate lead byte.</param>
/// <returns>true when all three bytes lie before <paramref name="len"/> and match the pattern.</returns>
private static bool Maybe3BytesUtf8(byte[] buf, int len, int index)
{
    if (index + 2 >= len)
    {
        return false;
    }

    byte lead = buf[index];
    if (lead < 0xE0 || lead > 0xEF)
    {
        return false;
    }

    byte second = buf[index + 1];
    if (second < 0x80 || second > 0xBF)
    {
        return false;
    }

    byte third = buf[index + 2];
    return third >= 0x80 && third <= 0xBF;
}

/// <summary>
/// Guesses the encoding of a BOM-less byte sample by scoring every byte against the
/// patterns of binary data, UCS-2, UTF-8, GB and Big5.
/// </summary>
/// <param name="buf">Sample bytes to classify.</param>
/// <param name="seeGBBig5">
/// When true and the GB/Big5 scores are too close to call, falls back to the round-trip
/// test in TryToDistinguishGBOrBig5.
/// </param>
/// <returns>
/// Binary, Ucs2_LE, Ucs2_BE, GB, Big5, Utf_8 or Ansi. An empty buffer yields Ansi.
/// </returns>
public static CharEncoding GuessCharEncoding(byte[] buf, bool seeGBBig5)
{
    int len = buf.Length;
    if (len == 0)
    {
        // Without this guard the "count * 8 >= len" tests below hold trivially (0 >= 0)
        // and an empty sample would be reported as Ucs2_LE.
        return CharEncoding.Ansi;
    }

    int idx = 0;
    int maybeGB = 0;    // weighted evidence for GB
    int mayBig5 = 0;    // weighted evidence for Big5
    int mayUtf8 = 0;    // weighted evidence for UTF-8
    int maybeLE = 0;    // zero bytes at odd offsets
    int maybeBE = 0;    // zero bytes at even offsets
    int mayBins = 0;    // zero bytes and other control bytes
    int serZero = 0;    // zero bytes directly followed by another zero byte
    int chsCount = 0;   // candidate double-byte characters
    int utf8Count = 0;  // candidate UTF-8 sequences
    int gbkNulls = 0;   // byte pairs that are unassigned in GBK
    int big5Nulls = 0;  // byte pairs that are unassigned in Big5

    while (idx < len)
    {
        byte cur = buf[idx];
        bool hasNext = idx + 1 < len;
        byte next = hasNext ? buf[idx + 1] : (byte)0;

        if (hasNext)
        {
            // Unassigned GBK areas: A1-A7 + 40-A0, AA-AF + A1-FE, F8-FE + A1-FE.
            if ((cur >= 0xA1 && cur <= 0xA7 && next >= 0x40 && next <= 0xA0) ||
                (((cur >= 0xAA && cur <= 0xAF) || (cur >= 0xF8 && cur <= 0xFE)) &&
                 next >= 0xA1 && next <= 0xFE))
            {
                gbkNulls++;
            }
            // Unassigned Big5 areas: any + 7F-A0, C7-C8 + 40-FE, C6 + A1-FE.
            if ((next >= 0x7F && next <= 0xA0) ||
                (cur >= 0xC7 && cur <= 0xC8 && next >= 0x40 && next <= 0xFE) ||
                (cur == 0xC6 && next >= 0xA1 && next <= 0xFE))
            {
                big5Nulls++;
            }
        }

        if (cur == 0x00)
        {
            mayBins++;
            if (hasNext && next == 0x00)
            {
                serZero++;
            }
            // A zero at an odd offset counts towards little-endian, at an even offset
            // towards big-endian.
            if (idx % 2 == 1)
            {
                maybeLE++;
            }
            else
            {
                maybeBE++;
            }
        }
        else if ((cur >= 1 && cur <= 8) || cur == 11 || cur == 12 ||
                 (cur >= 14 && cur <= 31))
        {
            // Control bytes other than TAB, LF and CR.
            mayBins++;
        }
        else if (cur == 0x80)
        {
            // Skip at most one continuation-range byte, then look for a 3-byte UTF-8
            // sequence. This branch does not consume any extra byte.
            int iUtf8 = idx;
            iUtf8++;
            if (iUtf8 + 1 < len && buf[iUtf8] >= 0x80 && buf[iUtf8] <= 0xBF)
            {
                iUtf8++;
            }
            if (Maybe3BytesUtf8(buf, len, iUtf8))
            {
                mayUtf8 += 32;
                utf8Count++;
            }
        }
        else if (cur >= 0x81 && cur <= 0xBF)
        {
            // 81-A0 scores for GB only; A1-BF scores for both GB and Big5.
            maybeGB += 8;
            if (cur > 0xA0)
            {
                mayBig5 += 8;
            }
            chsCount++;
            idx++; // consume the trail byte
            int iUtf8 = idx;
            if (iUtf8 + 1 < len && buf[iUtf8] >= 0x80 && buf[iUtf8] <= 0xBF)
            {
                iUtf8++;
            }
            if (Maybe3BytesUtf8(buf, len, iUtf8))
            {
                mayUtf8 += 32;
                utf8Count++;
            }
        }
        else if (cur >= 0xC0 && cur <= 0xDF)
        {
            if (hasNext && next >= 0x80 && next <= 0xBF)
            {
                // Shaped like a 2-byte UTF-8 sequence, but also a possible DBCS pair.
                mayUtf8++;
                utf8Count++;
                chsCount++;
                if (next >= 0xA1 && next <= 0xBF)
                {
                    maybeGB++;
                    mayBig5++;
                }
                else
                {
                    maybeGB += 4;
                }
            }
            else
            {
                maybeGB++;
                mayBig5++;
                chsCount++;
            }
            idx++; // consume the trail byte
        }
        else if (cur >= 0xE0 && cur <= 0xEF)
        {
            if (idx + 2 < len && buf[idx + 1] >= 0x80 && buf[idx + 1] <= 0xBF &&
                buf[idx + 2] >= 0x80 && buf[idx + 2] <= 0xBF)
            {
                mayUtf8 += 32;
                utf8Count++;
            }
            maybeGB++;
            mayBig5++;
            chsCount++;
            idx++; // consume one following byte
        }
        else if (cur >= 0xF0 && cur <= 0xFE)
        {
            // FA-FE gets an extra GB bonus on top of the shared score.
            if (cur >= 0xFA && cur <= 0xFE)
            {
                maybeGB += 8;
            }
            maybeGB += 8;
            mayBig5 += 8;
            chsCount++;
            idx++; // consume the trail byte
        }
        idx++;
    }

    // set encoding
    if (mayBins > 1 || maybeLE > 1 || maybeBE > 1 || mayBins * 8 >= len ||
        maybeLE * 8 >= len || maybeBE * 8 >= len)
    {
        if ((mayBins > maybeLE * 2 && mayBins > maybeBE * 2) || serZero > 10 ||
            serZero * 8 > Math.Max(maybeBE, maybeLE))
        {
            return CharEncoding.Binary;
        }
        return (maybeLE >= maybeBE) ? CharEncoding.Ucs2_LE : CharEncoding.Ucs2_BE;
    }
    else if (maybeGB >= mayUtf8 || mayBig5 >= mayUtf8 || chsCount >= utf8Count * 2)
    {
        // Relative lead of GB over Big5, in percent of the larger score.
        int ratio = (maybeGB - mayBig5) * 100 / Math.Max(1, Math.Max(maybeGB, mayBig5));
        if (Math.Abs(ratio) <= 5)
        {
            // Too close to call on scores: more GBK-unassigned pairs points to Big5,
            // more Big5-unassigned pairs points to GB.
            if (gbkNulls > big5Nulls)
            {
                return CharEncoding.Big5;
            }
            else if (gbkNulls < big5Nulls)
            {
                return CharEncoding.GB;
            }
            else if (seeGBBig5 && (maybeGB > 0 || mayBig5 > 0))
            {
                return TryToDistinguishGBOrBig5(buf);
            }
        }
        else
        {
            return ratio > 0 ? CharEncoding.GB : CharEncoding.Big5;
        }
    }
    else if (mayUtf8 > 0)
    {
        return CharEncoding.Utf_8;
    }
    return CharEncoding.Ansi;
}

/// <summary>
/// Reads up to SamplingSize bytes from the stream's current position and guesses their encoding.
/// </summary>
/// <param name="aStream">Stream to sample. Its Length and Position are read, so it must
/// support them. The position is advanced by the number of bytes read.</param>
/// <param name="seeGBBig5">Forwarded to the byte-array overload.</param>
/// <returns>The encoding guessed from the sampled bytes.</returns>
public static CharEncoding GuessCharEncoding(Stream aStream, bool seeGBBig5)
{
    long remaining = aStream.Length - aStream.Position;
    // Clamp to zero so a negative SamplingSize, or a position past the end, cannot
    // produce a negative array size.
    int wanted = (int)Math.Max(0, Math.Min(SamplingSize, remaining));
    byte[] buf = new byte[wanted];

    // Stream.Read may deliver fewer bytes than requested, so keep reading until the
    // sample is full or the stream ends.
    int filled = 0;
    while (filled < wanted)
    {
        int got = aStream.Read(buf, filled, wanted - filled);
        if (got <= 0)
        {
            break;
        }
        filled += got;
    }

    if (filled < wanted)
    {
        // Drop the unread tail: its zero bytes would be scored as binary/UCS-2 evidence.
        Array.Resize(ref buf, filled);
    }
    return GuessCharEncoding(buf, seeGBBig5);
}

/*-------------------------------------------------------------------*/

/// <summary>
/// Checks whether the first <paramref name="count"/> bytes of <paramref name="buf"/>
/// equal the first <paramref name="count"/> values in <paramref name="vals"/>.
/// </summary>
/// <param name="buf">Buffer to test.</param>
/// <param name="count">Number of leading bytes to compare.</param>
/// <param name="vals">Expected byte values.</param>
/// <returns>true when every compared position matches, or when no position is compared.</returns>
private static bool SameBytes(byte[] buf, int count, params byte[] vals)
{
    int pos = 0;
    // Advance over the matching prefix; stop at the first difference.
    while (pos < count && buf[pos] == vals[pos])
    {
        pos++;
    }
    return pos >= count;
}

/// <summary>
/// Guesses the encoding of <paramref name="source"/>, rewinds it to the beginning and
/// wraps it in a StreamReader that uses the matching encoding.
/// </summary>
/// <param name="source">Seekable stream to inspect and then read from.</param>
/// <param name="target">Receives the reader created over <paramref name="source"/>.</param>
/// <param name="seeGBBig5">Forwarded to GuessCharEncoding.</param>
/// <returns>The guessed encoding. Values without a dedicated reader encoding
/// (for example Ansi and Binary) get a reader using Encoding.Default.</returns>
private static CharEncoding DoTryToDecode(Stream source, out StreamReader target, bool seeGBBig5)
{
    CharEncoding guessed = GuessCharEncoding(source, seeGBBig5);
    source.Seek(0, SeekOrigin.Begin);

    Encoding decoder;
    if (guessed == CharEncoding.Utf_8)
    {
        decoder = Encoding.UTF8;
    }
    else if (guessed == CharEncoding.Ucs2_BE)
    {
        decoder = Encoding.BigEndianUnicode;
    }
    else if (guessed == CharEncoding.Ucs2_LE)
    {
        decoder = Encoding.Unicode;
    }
    else if (guessed == CharEncoding.Utf32_BE)
    {
        decoder = new UTF32Encoding(true, true);
    }
    else if (guessed == CharEncoding.Utf32_LE)
    {
        decoder = new UTF32Encoding(false, true);
    }
    else if (guessed == CharEncoding.GB)
    {
        decoder = Encoding.GetEncoding("GB18030");
    }
    else if (guessed == CharEncoding.Big5)
    {
        decoder = Encoding.GetEncoding("Big5");
    }
    else
    {
        decoder = Encoding.Default;
    }

    target = new StreamReader(source, decoder);
    return guessed;
}

/// <summary>
/// Opens a text file and returns a StreamReader configured for its encoding. A byte order
/// mark is honoured first; without one the content is optionally analysed.
/// </summary>
/// <param name="fileName">Path of the file to open for reading.</param>
/// <param name="tryToDecode">When true and no BOM is found, guess the encoding from the content.</param>
/// <param name="seeGBBig5">Forwarded to the content analysis to separate GB from Big5.</param>
/// <param name="aStreamReader">Receives the reader over the opened file. The caller owns it
/// and must close it.</param>
/// <returns>The detected encoding; Ansi when nothing more specific was found.</returns>
/// <remarks>If an exception occurs after the file has been opened, the file is closed
/// before the exception propagates.</remarks>
public static CharEncoding SmartLoadTextFileToStream(string fileName,
    bool tryToDecode, bool seeGBBig5, out StreamReader aStreamReader)
{
    aStreamReader = null;
    FileStream fs = File.OpenRead(fileName);
    try
    {
        if (fs.CanRead)
        {
            byte[] bom = new byte[4];
            int reads = fs.Read(bom, 0, 4);
            if (reads >= 2)
            {
                fs.Seek(0, SeekOrigin.Begin);
                if (reads >= 4)
                {
                    // The UTF-32 LE BOM starts with the UTF-16 LE BOM, so the 4-byte
                    // marks must be tested before the 2-byte ones.
                    //UTF32, Big-Endian
                    if (SameBytes(bom, 4, 0x00, 0x00, 0xFE, 0xFF))
                    {
                        aStreamReader = new StreamReader(fs, new UTF32Encoding(true, true));
                        return CharEncoding.Utf32_BE;
                    }
                    //UTF32, Little-Endian
                    if (SameBytes(bom, 4, 0xFF, 0xFE, 0x00, 0x00))
                    {
                        aStreamReader = new StreamReader(fs, new UTF32Encoding(false, true));
                        return CharEncoding.Utf32_LE;
                    }
                }
                //UTF8
                if (reads >= 3 && SameBytes(bom, 3, 0xEF, 0xBB, 0xBF))
                {
                    aStreamReader = new StreamReader(fs, Encoding.UTF8);
                    return CharEncoding.Utf_8;
                }
                //UTF16, Big-Endian
                if (SameBytes(bom, 2, 0xFE, 0xFF))
                {
                    aStreamReader = new StreamReader(fs, Encoding.BigEndianUnicode);
                    return CharEncoding.Ucs2_BE;
                }
                //UTF16, Little-Endian
                if (SameBytes(bom, 2, 0xFF, 0xFE))
                {
                    aStreamReader = new StreamReader(fs, Encoding.Unicode);
                    return CharEncoding.Ucs2_LE;
                }
                if (tryToDecode)
                {
                    return DoTryToDecode(fs, out aStreamReader, seeGBBig5);
                }
            }
        }
        // Fewer than two bytes, or no BOM and no analysis requested.
        fs.Seek(0, SeekOrigin.Begin);
        aStreamReader = new StreamReader(fs, Encoding.Default);
        return CharEncoding.Ansi;
    }
    catch
    {
        // The caller never receives a reader on failure, so release the file handle here.
        aStreamReader = null;
        fs.Close();
        throw;
    }
}

}
}

  使用示例:

// Usage example: loads a file into TextBox1 and shows the detected encoding in the
// window title. The reader is closed even if reading or a UI update throws.
private void OpenFile(string fileName)
{
    StreamReader sr = null;
    SmartLoadCS.CharEncoding encode = SmartLoadCS.SmartLoadTextFileToStream(fileName, true, true, out sr);
    if (sr != null)
    {
        try
        {
            TextBox1.Text = sr.ReadToEnd();
            this.Text = string.Format("Encoding Demo - {0} <{1}>", (fileName), (Enum.GetName(typeof(SmartLoadCS.CharEncoding), encode)));
        }
        finally
        {
            // Closing the reader also releases the underlying file.
            sr.Close();
        }
    }
    else
    {
        this.Text = string.Format("Encoding Demo(Open Error) - {0}", (fileName));
    }
}

// 附:关联文章

// 文本编码的智能识别

// 简繁体编码识别原理

// 文本编码的智能识别(续) - 完整代码部分

// 文本编码的智能识别(续) - Unicode版本代码

  评论这张
 
阅读(1461)| 评论(0)
推荐 转载

历史上的今天

评论

<#--最新日志,群博日志--> <#--推荐日志--> <#--引用记录--> <#--博主推荐--> <#--随机阅读--> <#--首页推荐--> <#--历史上的今天--> <#--被推荐日志--> <#--上一篇,下一篇--> <#-- 热度 --> <#-- 网易新闻广告 --> <#--右边模块结构--> <#--评论模块结构--> <#--引用模块结构--> <#--博主发起的投票-->
 
 
 
 
 
 
 
 
 
 
 
 
 
 

页脚

网易公司版权所有 ©1997-2018