注册 登录  
 加关注
查看详情
   显示下一条  |  关闭
温馨提示!由于新浪微博认证机制调整,您的新浪微博帐号绑定已过期,请重新绑定!立即重新绑定新浪微博》  |  关闭

银河军团大本营

光荣的军团,永远的丰碑 <坚持原创>

 
 
 

日志

 
 

文本编码的智能识别(续) - 完整代码部分  

2012-03-13 17:38:02|  分类: 软件就是生活 |  标签: |举报 |字号 订阅

  下载LOFTER 我的照片书  |
//核心代码 (将其保存为SmartLoadFile.pas)  只支持D5~D2007  

unit SmartLoadFile;

{*******************************************************************************
* 智能识别文本编码 *
* *
* 功能: 本单元用于智能识别文本文件编码,包括简繁体文本的识别,支持D5-D2007 *
* 实现: 2012.3.10 ~ 2012.3.15 完成Ansi(GB/Big5)/Unicode的识别功能 *
* 2012.3.18 ~ 2012.3.19 完成UTF-16与二进制文件的区分功能 *
* 2012.6.2 ~ 2012.6.3 改进简繁体的识别方法,提高了识别率 *
* Blog: dreamisx.blog.163.com * * EMail: iamdream%yeah.net (% -> @) *
*******************************************************************************}

interface

{$IFDEF UNICODE}Sorry, not Support Unicode Version!{$ENDIF}

{$IFDEF VER150} //消除D7警告
{$WARN UNIT_PLATFORM OFF}
{$WARN UNSAFE_CODE OFF}
{$WARN UNSAFE_TYPE OFF}
{$WARN UNSAFE_CAST OFF}
{$ENDIF}

uses
Windows, Messages, SysUtils, Classes, Math, GB2BigEx
{$IFDEF VER130}, Unicode{$ENDIF};

type
TCharEncoding = (ceAnsi, ceBinary, ceUtf_8, ceUcs2_LE, ceUcs2_BE, ceUtf_32, ceGB, ceBig5);
TCharConvertProc = function (const S: string): string;


function SysIsSimplifiedChinese: Boolean;
function SysIsTraditionalChinese: Boolean;
function TryToDistinguishGBOrBig5(const S: string): TCharEncoding;
function GuessCharEncoding(const buf: string; SeeGBBig5: Boolean): TCharEncoding; overload;
function GuessCharEncoding(AStream: TStream; SeeGBBig5: Boolean): TCharEncoding; overload;

procedure InitCharConvert(AGB2Big5, ABig52GB: TCharConvertProc;
AExactCompare: Boolean = False; ASamplingSize: Integer = 4096);
function SmartLoadTextFileToStream(const AFileName: string; AStream: TStream;
TryToDecode: Boolean = True; SeeGBBig5: Boolean = True): TCharEncoding;

implementation

{---------------------------- Character Encoding -----------------------------

1. UTF-8
Ascii Chars: 00-7F // 1 Bytes = 0xxxxxxx
Multi Bytes: C0-DF + 80-BF // 2 Bytes = 110xxxxx 10xxxxxx
E0-EF + 80-BF + 80-BF // 3 Bytes = 1110xxxx 10xxxxxx 10xxxxxx
F0-F7 + 80-BF + 80-BF + 80-BF // 4 Bytes = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

2. Unicode 16 (UCS2) [as UTF-16 for 96.9%]
2 Bytes Characters: 0000-FFFF

3. UTF-16
0000-D7FF = 0000-D7FF // 2 Bytes
E000-FFFF = E000-FFFF // 2 Bytes
10000-10FFFF = D800-D8FF + DC00-DCFF // 4 Bytes
D800-DFFF // Surrogate

4. UTF-32 = 32 bit unsigned integer of character

5. GB2312-80
Ascii Chars: 00-7F
Simplified Chinese Chars: A1-F7 + A1-FE
Note: A9 + A4-EF ==> Tabs in Chinese Chars

6. Big5
Ascii Chars: 00-7F
Traditional Chinses Chars: A1-F9 + 40-7E
A1-F9 + A1-FE

7. GBK
Ascii Chars: 00-7F
Chinses Chars: 81-FE + 40-7E
81-FE + 80-FE

8. GB18030-2000
Ascii Chars: 00-7F
Chinese Chars (2 Bytes): 81-FE + 40-7E
81-FE + 80-FE
Chinese Chars (4 Bytes): 81-FE + 30-39 + 81-FE + 30-39



null GBK:
A1.40-A0
...
A7.40-A0
AA.A1-FE
...
AF.A1-FE
F8.A1-FE
...
FE.A1-FE

null Big5:
C6.A1-FE
C7.40-FE
C8.40-FE
XX.7F-A0

-----------------------------------------------------------------------------}

var
ExactCompare: Boolean = False; // for exact convert GB / Big5
SamplingSize: Integer = 4096; // for guessing GB / Big5
GB2Big5Proc: TCharConvertProc;
Big52GBProc: TCharConvertProc;


{ Internal functions of transfering text to ANSI }

procedure DoUtf8ToAnsi(Source, Target: TStream);
var
buf: string;
ret: string;
begin
SetLength(buf, Source.Size - Source.Position);
Source.Read(PChar(buf)^, Length(buf));
ret := Utf8ToAnsi(buf);
Target.Write(PChar(ret)^, Length(ret));
end;

procedure DoLittleEndianUnicode16ToAnsi(Source, Target: TStream);
const //UCS2 ? (Cannot process 4 Bytes' Chars!!)
cBufSize = 4096;
var
buf: array[1..cBufSize] of Char;
pIn: PWideChar;
readSize: Longint;
sOut: string;
begin
repeat
readSize := Source.Read(buf, cBufSize);
if readSize > 0 then begin
pIn := PWideChar(@buf[1]);
sOut := WideCharLenToString(pIn, readSize shr 1);
Target.Write(Pointer(sOut)^, Length(sOut));
end;
until readSize < cBufSize;
end;

procedure DoBigEndianUnicode16ToAnsi(Source, Target: TStream);
const //UCS2 ? (Cannot process 4 Bytes' Chars!!)
cBufSize = 4096;
var
buf: array[1..cBufSize] of Char;
pIn: PWideChar;
readSize: Longint;
sOut: string;
c: Char;
i: Longint;
begin
repeat
readSize := Source.Read(buf, cBufSize);
if readSize > 0 then begin
pIn := PWideChar(@buf[1]);
// adjust byte order
i := 1;
while i <= readSize do begin
c := buf[i];
buf[i] := buf[i +1];
buf[i +1] := c;
Inc(i, 2);
end;
sOut := WideCharLenToString(pIn, readSize shr 1);
Target.Write(Pointer(sOut)^, Length(sOut));
end;
until readSize < cBufSize;
end;

procedure DoGBToAnsi(Source, Target: TStream);
var
S: string;
begin
//If System is Traditional Chinese, Convert it.
if SysIsTraditionalChinese() then begin
SetLength(S, Source.Size);
Source.Read(PChar(S)^, Length(S));
S := Gb2Big5Proc(S);
Target.Write(PChar(S)^, Length(S));
end else begin
Target.CopyFrom(Source, Source.Size);
end;
end;

procedure DoBig5ToAnsi(Source, Target: TStream);
var
S: string;
begin
//If System is Simplified Chinese, Convert it.
if SysIsSimplifiedChinese() then begin
SetLength(S, Source.Size);
Source.Read(PChar(S)^, Length(S));
S := Big52GbProc(S);
Target.Write(PChar(S)^, Length(S));
end else begin
Target.CopyFrom(Source, Source.Size);
end;
end;

{ Count Chinese Characters }

function CountChineseChars(const S: string): Integer;
var
i: Integer;
begin
Result := 0;
i := 1;
while i <= Length(S) do begin
if S[i] > #$80 then begin
Inc(Result, 2);
Inc(i, 2);
end else begin
Inc(i);
end;
end;
end;

//....................... functions of GB / Big5 .............................

function SysIsSimplifiedChinese: Boolean;
begin
Result := (SysLocale.PriLangID = LANG_CHINESE) and
(SysLocale.SubLangID in [SUBLANG_CHINESE_SIMPLIFIED, SUBLANG_CHINESE_SINGAPORE]);
end;

function SysIsTraditionalChinese: Boolean;
begin
Result := (SysLocale.PriLangID = LANG_CHINESE) and
(SysLocale.SubLangID in [SUBLANG_CHINESE_TRADITIONAL, SUBLANG_CHINESE_HONGKONG]);
end;

function TryToDistinguishGBOrBig5(const S: string): TCharEncoding;

function MyCompareChineseStr(const s1, s2: string): Boolean;
var
difCount: Integer;
i, k: Integer;
cmpLen: Integer;
begin
difCount := 0;
i := 1;
k := 1;
while (i <= Length(s1)) and (k <= Length(s2)) do begin
if s1[i] <> s2[k] then begin
if (i +2 <= Length(s1)) and (k +2 <= Length(s2)) then begin
//比较原理:两次转换后,某些字可能会转戌一个'?',以下处理这种情况
if (s1[i +1] = s2[k]) and (s1[i +2] = s2[k +1]) then begin
Inc(i);
end else if (s1[i] = s2[k +1]) and (s1[i +1] = s2[k +2]) then begin
Inc(k);
end else begin
Inc(difCount);
end;
end else begin
Inc(difCount);
end;
end;
Inc(i);
Inc(k);
end;
if ExactCompare then begin
cmpLen := CountChineseChars(S);
end else begin
cmpLen := Length(S);
end;
Result := difCount * 100 div Max(1, cmpLen) <= 6; // different <= 6%
end;

begin
Result := ceAnsi;
if SysLocale.PriLangID = LANG_CHINESE then begin
case SysLocale.SubLangID of
SUBLANG_CHINESE_SIMPLIFIED,
SUBLANG_CHINESE_SINGAPORE: begin
if MyCompareChineseStr(S, Big52GBProc(GB2Big5Proc(S))) then begin
Result := ceGB;
end else begin
Result := ceBig5;
end;
end;
SUBLANG_CHINESE_TRADITIONAL,
SUBLANG_CHINESE_HONGKONG: begin
if MyCompareChineseStr(S, GB2Big5Proc(Big52GBProc(S))) then begin
Result := ceBig5;
end else begin
Result := ceGB;
end;
end;
end;
end;
end;

//............................................................................
// It can distinguish encoding is UTF8 or Ansi,
// but may not distinguish encoding is GB or Big5,
// so it need to process GB / Big5 specially

function GuessCharEncoding(const buf: string; SeeGBBig5: Boolean): TCharEncoding;
var
len: Longint;

function Maybe3BytesUtf8(Index: Integer): Boolean;
begin
Result := (Index + 2 <= len) and (buf[Index] in [#$E0..#$EF]) and
(buf[Index +1] in [#$80..#$BF]) and (buf[Index +2] in [#$80..#$BF]);
end;

var
idx: Longint;
iUtf8: Longint;
maybeGB: Integer; //GB2312/GBK/GB18030
mayBig5: Integer; //Big5
mayUtf8: Integer; //Utf-8
maybeLE: Integer; //Unicode 16 (UCS2) , Little Endian
maybeBE: Integer; //Unicode 16 (UCS2) , Big Endian
mayBins: Integer; //Binary File Chars ?
serZero: Integer;
ratio: Integer;
chsCount: Integer;
utf8Count: Integer;
gbkNulls: Integer;
big5Nulls: Integer;
begin
Result := ceAnsi;
maybeGB := 0;
mayBig5 := 0;
mayUtf8 := 0;
maybeLE := 0;
maybeBE := 0;
mayBins := 0;
serZero := 0;
chsCount := 0;
utf8Count := 0;
gbkNulls := 0;
big5Nulls := 0;
len := Length(buf);
idx := 1;
while idx <= len do begin
if idx < len then begin
if (buf[idx] in [#$A1..#$A7]) and (buf[idx +1] in [#$40..#$A0]) or
(buf[idx] in [#$AA..#$AF, #$F8..#$FE]) and (buf[idx +1] in [#$A1..#$FE]) then begin
Inc(gbkNulls);
end;
if (buf[idx +1] in [#$7F..#$A0]) or
(buf[idx] in [#$C7, #$C8]) and (buf[idx +1] in [#$40..#$FE]) or
(buf[idx] = #$C6) and (buf[idx +1] in [#$A1..#$FE]) then begin
Inc(big5Nulls);
end;
end;
case buf[idx] of
#0: begin
Inc(mayBins);
if (idx < len) and (buf[idx +1] = #0) then begin
Inc(serZero);
end;
if (idx mod 2) = 0 then begin
Inc(maybeLE);
end else begin
Inc(maybeBE);
end;
end;
#1..#8, #11, #12, #14..#31: begin
Inc(mayBins);
end;
#$80: begin
iUtf8 := idx;
Inc(iUtf8);
if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8);
if Maybe3BytesUtf8(iUtf8) then begin
Inc(mayUtf8, 32);
Inc(utf8Count);
end;
end;
#$81..#$BF: begin
if buf[idx] in [#$81..#$A0] then begin
Inc(maybeGB, 8);
end else begin
Inc(maybeGB, 8);
Inc(mayBig5, 8);
end;
Inc(chsCount);
Inc(idx);
iUtf8 := idx;
if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8);
if Maybe3BytesUtf8(iUtf8) then begin
Inc(mayUtf8, 32);
Inc(utf8Count);
end;
end;
#$C0..#$DF: begin
if (idx < len) and (buf[idx +1] in [#$80..#$BF]) then begin
Inc(mayUtf8);
Inc(utf8Count);
Inc(chsCount);
if (buf[idx +1] in [#$A1..#$BF]) then begin
Inc(maybeGB);
Inc(mayBig5);
end else begin
Inc(maybeGB, 4);
end;
end else begin
Inc(maybeGB);
Inc(mayBig5);
Inc(chsCount);
end;
Inc(idx);
end;
#$E0..#$EF: begin
if (idx + 2 <= len) and (buf[idx +1] in [#$80..#$BF]) and (buf[idx +2] in [#$80..#$BF]) then begin
Inc(mayUtf8, 32);
Inc(utf8Count);
end;
Inc(maybeGB);
Inc(mayBig5);
Inc(chsCount);
Inc(idx);
end;
#$F0..#$FE: begin
if buf[idx] in [#$FA..#$FE] then begin
Inc(maybeGB, 8);
end;
Inc(maybeGB, 8);
Inc(mayBig5, 8);
Inc(chsCount);
Inc(idx);
end;
end;
Inc(idx);
end;
// set encoding
if (mayBins > 1) or (maybeLE > 1) or (maybeBE > 1) or (mayBins * 8 >= len) or (maybeLE * 8 >= len) or (maybeBE * 8 >= len) then begin
if (mayBins > maybeLE *2) and (mayBins > maybeBE *2) or (serZero > 10) or
(serZero * 8 > Max(maybeLE, maybeBE)) then begin
Result := ceBinary;
end else if maybeLE >= maybeBE then begin
Result := ceUcs2_LE;
end else begin
Result := ceUcs2_BE;
end;
end else if (maybeGB >= mayUtf8) or (mayBig5 >= mayUtf8) or (chsCount >= utf8Count *2) then begin
ratio := (maybeGB - mayBig5) * 100 div Max(1, Max(maybeGB, mayBig5));
if Abs(ratio) <= 5 then begin
if gbkNulls > big5Nulls then begin
Result := ceBig5;
end else if gbkNulls < big5Nulls then begin
Result := ceGB;
end else if SeeGBBig5 and ((maybeGB > 0) or (mayBig5 > 0)) then begin
Result := TryToDistinguishGBOrBig5(Copy(buf, 1, len));
end;
end else begin
if ratio > 0 then begin
Result := ceGB;
end else begin
Result := ceBig5;
end;
end;
end else if mayUtf8 > 0 then begin
Result := ceUtf_8;
end;
end;

function GuessCharEncoding(AStream: TStream; SeeGBBig5: Boolean): TCharEncoding;
var
buf: string;
begin
SetLength(buf, Min(SamplingSize, AStream.Size - AStream.Position));
AStream.Read(buf[1], Length(buf));
Result := GuessCharEncoding(buf, SeeGBBig5);
end;

function DoTryToDecode(Source, Target: TStream; SeeGBBig5: Boolean): TCharEncoding;
begin
// test character encoding
Result := GuessCharEncoding(Source, SeeGBBig5);

// transfer encoding
Source.Seek(0, soFromBeginning);
case Result of
ceUtf_8: DoUtf8ToAnsi(Source, Target);
ceUcs2_LE: DoLittleEndianUnicode16ToAnsi(Source, Target);
ceUcs2_BE: DoBigEndianUnicode16ToAnsi(Source, Target);
ceUtf_32: raise Exception.Create('UTF-32 not support yet.');
ceGB: DoGBToAnsi(Source, Target);
ceBig5: DoBig5ToAnsi(Source, Target);
else //ceAnsi, ceBinary
Target.CopyFrom(Source, Source.Size);
end;
end;

{------------------------- Intialize Char Convert ---------------------------}

procedure InitCharConvert(AGB2Big5, ABig52GB: TCharConvertProc;
AExactCompare: Boolean; ASamplingSize: Integer);
begin
ExactCompare := AExactCompare;
SamplingSize := ASamplingSize;
if Assigned(AGB2Big5) then begin
GB2Big5Proc := AGB2Big5;
end else begin
GB2Big5Proc := GB2BigEx.Gb2Big5;
end;
if Assigned(ABig52GB) then begin
Big52GBProc := ABig52GB;
end else begin
Big52GBProc := GB2BigEx.Big52GB;
end;
end;

{--------------------------- Smart load text file ---------------------------}

function SmartLoadTextFileToStream(const AFileName: string; AStream: TStream;
TryToDecode, SeeGBBig5: Boolean): TCharEncoding;
var
fs: TFileStream;
bom: array[1..4] of Char;
len: Longint;
begin
Result := ceAnsi;
fs := TFileStream.Create(AFileName, fmOpenRead or fmShareDenyNone);
with fs do try
if Size = 0 then Exit;
len := Read(bom, SizeOf(bom));
if (len >= 3) and (bom[1] = #$EF) and (bom[2] = #$BB) and (bom[3] = #$BF) then begin //UTF8
Result := ceUtf_8;
Seek(3, soFromBeginning);
DoUtf8ToAnsi(fs, AStream);
end else if (len >= 2) and (bom[1] = #$FF) and (bom[2] = #$FE) then begin //UTF-16/UCS-2, little endian
Result := ceUcs2_LE;
Seek(2, soFromBeginning);
DoLittleEndianUnicode16ToAnsi(fs, AStream);
end else if (len >= 2) and (bom[1] = #$FE) and (bom[2] = #$FF) then begin //UTF-16/UCS-2, big endian
Result := ceUcs2_BE;
Seek(2, soFromBeginning);
DoBigEndianUnicode16ToAnsi(fs, AStream);
end else begin
Seek(0, soFromBeginning);
if TryToDecode then begin
Result := DoTryToDecode(fs, AStream, SeeGBBig5);
end else begin
AStream.CopyFrom(fs, fs.Size);
end;
end;
AStream.Position := 0;
finally
fs.Free;
end;
end;

initialization
GB2Big5Proc := GB2BigEx.Gb2Big5;
Big52GBProc := GB2BigEx.Big52GB;

end.

//******************************************************************************
// GB/Big5相互转换代码(直接调用API实现,也可以用其他类似函数代替)

unit GB2BigEX;

interface

{$IFDEF VER150} //D7
{$WARN UNSAFE_CODE OFF}
{$WARN UNSAFE_TYPE OFF}
{$WARN UNSAFE_CAST OFF}
{$ENDIF}

uses
Windows;

function Gb2Big5(const S: string): string;
function Big52Gb(const S: string): string;


implementation

function UnicodeEncode(const S: string; CodePage: Integer): WideString;
var
Len: Integer;
begin
Len := Length(S) + 1;
SetLength(Result, Len);
Len := MultiByteToWideChar(CodePage, 0, PChar(S), -1, PWideChar(Result), Len);
SetLength(Result, Len - 1); //end is #0
end;

function UnicodeDecode(const S: WideString; CodePage: Integer): string;
var
Len: Integer;
begin
Len := Length(S) * 2 + 1; //one for #0
SetLength(Result, Len);
Len := WideCharToMultiByte(CodePage, 0, PWideChar(S), -1, PChar(Result), Len, nil, nil);
SetLength(Result, Len - 1);
end;

function Gb2Big5(const S: string): string;
begin
SetLength(Result, Length(S));
LCMapString(GetUserDefaultLCID, LCMAP_TRADITIONAL_CHINESE,
PChar(S), Length(S),
PChar(Result), Length(Result));
Result := UnicodeDecode(UnicodeEncode(Result, 936), 950); //改成Result := UnicodeEncode(Result, 936);则可在GBK中显示繁体字
end;

function Big52Gb(const S: string): string;
var
tmp: string;
begin
tmp := UnicodeDecode(UnicodeEncode(S, 950), 936);
SetLength(Result, Length(tmp));
LCMapString(GetUserDefaultLCID, LCMAP_SIMPLIFIED_CHINESE,
PChar(tmp), Length(tmp),
PChar(Result), Length(Result));
end;

end.


// 如果是在Delphi5下使用,那还需要一个Unicode.pas

{****************************************************************************}
{ Some Function of Ansi, UTF8, Unicode Converting (copy from Delphi6) }
{****************************************************************************}

unit Unicode;

interface

uses
Classes, Windows, SysUtils;

type
UTF8String = type string;
PUTF8String = ^UTF8String;

{ PChar/PWideChar Unicode <-> UTF8 conversion }

// UnicodeToUTF8(3):
// UTF8ToUnicode(3):
// Scans the source data to find the null terminator, up to MaxBytes
// Dest must have MaxBytes available in Dest.
// MaxDestBytes includes the null terminator (last char in the buffer will be set to null)
// Function result includes the null terminator.

function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: Integer): Integer; overload; //deprecated;
function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: Integer): Integer; overload; //deprecated;

// UnicodeToUtf8(4):
// UTF8ToUnicode(4):
// MaxDestBytes includes the null terminator (last char in the buffer will be set to null)
// Function result includes the null terminator.
// Nulls in the source data are not considered terminators - SourceChars must be accurate

function UnicodeToUtf8(Dest: PChar; MaxDestBytes: Cardinal; Source: PWideChar; SourceChars: Cardinal): Cardinal; overload;
function Utf8ToUnicode(Dest: PWideChar; MaxDestChars: Cardinal; Source: PChar; SourceBytes: Cardinal): Cardinal; overload;

{ WideString <-> UTF8 conversion }

function UTF8Encode(const WS: WideString): UTF8String;
function UTF8Decode(const S: UTF8String): WideString;

{ Ansi <-> UTF8 conversion }

function AnsiToUtf8(const S: string): UTF8String;
function Utf8ToAnsi(const S: UTF8String): string;

implementation

// UnicodeToUTF8(3):
// Scans the source data to find the null terminator, up to MaxBytes
// Dest must have MaxBytes available in Dest.

function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: Integer): Integer;
var
len: Cardinal;
begin
len := 0;
if Source <> nil then
while Source[len] <> #0 do
Inc(len);
Result := UnicodeToUtf8(Dest, MaxBytes, Source, len);
end;

// UnicodeToUtf8(4):
// MaxDestBytes includes the null terminator (last char in the buffer will be set to null)
// Function result includes the null terminator.
// Nulls in the source data are not considered terminators - SourceChars must be accurate

function UnicodeToUtf8(Dest: PChar; MaxDestBytes: Cardinal; Source: PWideChar; SourceChars: Cardinal): Cardinal;
var
i, count: Cardinal;
c: Cardinal;
begin
Result := 0;
if Source = nil then Exit;
count := 0;
i := 0;
if Dest <> nil then
begin
while (i < SourceChars) and (count < MaxDestBytes) do
begin
c := Cardinal(Source[i]);
Inc(i);
if c <= $7F then
begin
Dest[count] := Char(c);
Inc(count);
end
else if c > $7FF then
begin
if count + 3 > MaxDestBytes then
break;
Dest[count] := Char($E0 or (c shr 12));
Dest[count+1] := Char($80 or ((c shr 6) and $3F));
Dest[count+2] := Char($80 or (c and $3F));
Inc(count,3);
end
else // $7F < Source[i] <= $7FF
begin
if count + 2 > MaxDestBytes then
break;
Dest[count] := Char($C0 or (c shr 6));
Dest[count+1] := Char($80 or (c and $3F));
Inc(count,2);
end;
end;
if count >= MaxDestBytes then count := MaxDestBytes-1;
Dest[count] := #0;
end
else
begin
while i < SourceChars do
begin
c := Integer(Source[i]);
Inc(i);
if c > $7F then
begin
if c > $7FF then
Inc(count);
Inc(count);
end;
Inc(count);
end;
end;
Result := count+1; // convert zero based index to byte count
end;

function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: Integer): Integer;
var
len: Cardinal;
begin
len := 0;
if Source <> nil then
while Source[len] <> #0 do
Inc(len);
Result := Utf8ToUnicode(Dest, MaxChars, Source, len);
end;

function Utf8ToUnicode(Dest: PWideChar; MaxDestChars: Cardinal; Source: PChar; SourceBytes: Cardinal): Cardinal;
var
i, count: Cardinal;
c: Byte;
wc: Cardinal;
begin
if Source = nil then
begin
Result := 0;
Exit;
end;
Result := Cardinal(-1);
count := 0;
i := 0;
if Dest <> nil then
begin
while (i < SourceBytes) and (count < MaxDestChars) do
begin
wc := Cardinal(Source[i]);
Inc(i);
if (wc and $80) <> 0 then
begin
wc := wc and $3F;
if i > SourceBytes then Exit; // incomplete multibyte char
if (wc and $20) <> 0 then
begin
c := Byte(Source[i]);
Inc(i);
if (c and $C0) <> $80 then Exit; // malformed trail byte or out of range char
if i > SourceBytes then Exit; // incomplete multibyte char
wc := (wc shl 6) or (c and $3F);
end;
c := Byte(Source[i]);
Inc(i);
if (c and $C0) <> $80 then Exit; // malformed trail byte

Dest[count] := WideChar((wc shl 6) or (c and $3F));
end
else
Dest[count] := WideChar(wc);
Inc(count);
end;
if count >= MaxDestChars then count := MaxDestChars-1;
Dest[count] := #0;
end
else
begin
while (i <= SourceBytes) do
begin
c := Byte(Source[i]);
Inc(i);
if (c and $80) <> 0 then
begin
if (c and $F0) = $F0 then Exit; // too many bytes for UCS2
if (c and $40) = 0 then Exit; // malformed lead byte
if i > SourceBytes then Exit; // incomplete multibyte char

if (Byte(Source[i]) and $C0) <> $80 then Exit; // malformed trail byte
Inc(i);
if i > SourceBytes then Exit; // incomplete multibyte char
if ((c and $20) <> 0) and ((Byte(Source[i]) and $C0) <> $80) then Exit; // malformed trail byte
Inc(i);
end;
Inc(count);
end;
end;
Result := count+1;
end;

function Utf8Encode(const WS: WideString): UTF8String;
var
L: Integer;
Temp: UTF8String;
begin
Result := '';
if WS = '' then Exit;
SetLength(Temp, Length(WS) * 3); // SetLength includes space for null terminator

L := UnicodeToUtf8(PChar(Temp), Length(Temp)+1, PWideChar(WS), Length(WS));
if L > 0 then
SetLength(Temp, L-1)
else
Temp := '';
Result := Temp;
end;

function Utf8Decode(const S: UTF8String): WideString;
var
L: Integer;
Temp: WideString;
begin
Result := '';
if S = '' then Exit;
SetLength(Temp, Length(S));

L := Utf8ToUnicode(PWideChar(Temp), Length(Temp)+1, PChar(S), Length(S));
if L > 0 then
SetLength(Temp, L-1)
else
Temp := '';
Result := Temp;
end;

function AnsiToUtf8(const S: string): UTF8String;
begin
Result := Utf8Encode(S);
end;

function Utf8ToAnsi(const S: UTF8String): string;
begin
Result := Utf8Decode(S);
end;

end.


// 测试代码:

procedure TForm1.LoadTextFile(const AFileName: string);
const
  cEncodings: array[TCharEncoding] of string = ('Ansi', 'Binary', 'UTF-8',
    'UCS2(Little Endian)', 'UCS2(Big Endian)', 'UTF-32', 'GB', 'Big5');
var
  stream: TStream;
  charEncoding: TCharEncoding;
begin
  stream := TMemoryStream.Create;
  try
    charEncoding := SmartLoadTextFileToStream(AFileName, stream);
    Memo1.Lines.LoadFromStream(stream);
    Caption := Format('%s - %s (%s)', ['Smart Load Text File', AFileName, cEncodings[charEncoding]]);
  finally
    stream.Free;
  end;
end;

// 附:关联文章
// 文本编码的智能识别(续) - C#版本代码
  评论这张
 
阅读(1611)| 评论(7)
推荐 转载

历史上的今天

评论

<#--最新日志,群博日志--> <#--推荐日志--> <#--引用记录--> <#--博主推荐--> <#--随机阅读--> <#--首页推荐--> <#--历史上的今天--> <#--被推荐日志--> <#--上一篇,下一篇--> <#-- 热度 --> <#-- 网易新闻广告 --> <#--右边模块结构--> <#--评论模块结构--> <#--引用模块结构--> <#--博主发起的投票-->
 
 
 
 
 
 
 
 
 
 
 
 
 
 

页脚

网易公司版权所有 ©1997-2018