当前位置:网站首页>[Delphi] determine the encoding method of the file (ANSI, Unicode, utf8, unicodebig)

[Delphi] determine the encoding method of the file (ANSI, Unicode, utf8, unicodebig)

2022-06-11 23:03:00 sensor_ WU

During development you often run into files whose encoding is not what you expect, and sometimes the content also has to be converted to another encoding. The details of each encoding are easy to look up, so they are not repeated here; below is how I handled the detection. With small modifications, the same approach can be used for automatic detection and for encoding conversion.

{ Determines the encoding of a file; the same approach can be extended
  to perform encoding conversion.
  sensor
  2018-08-02
}
unit uCODE_Convert;

interface
uses
  Winapi.Windows,
  System.SysUtils,
  System.Variants,
  System.Classes;

type
  // Encodings this unit can report for a file:
  //   ctANSI       - mapped to TEncoding.ANSI (also used as the fallback)
  //   ctUnicode    - mapped to TEncoding.Unicode (BOM FF FE)
  //   ctUTF8       - mapped to TEncoding.UTF8 (BOM EF BB BF)
  //   ctUnicodeBIG - mapped to TEncoding.BigEndianUnicode (BOM FE FF)
  TCODE_TYPE = (ctANSI,ctUnicode,ctUTF8,ctUnicodeBIG);


// Determine the encoding format of a file.
// Input  : FileName - name of the file to inspect.
// Output : the detected encoding as a TCODE_TYPE value.
function Get_FileCode_TYPE(FileName : string) : TCODE_TYPE;

// Judge whether the content of a stream looks like ANSI-encoded text.
// The criteria are: the content must not contain a $00 byte, and bytes
// >= $80 are expected to appear in pairs (runs of even length).
// Input  : M - the file content as a byte stream.
// Output : True when the content is judged to be ANSI, False otherwise.
function is_ANSI_CODE(M : TMemoryStream) : Boolean;

// Judge whether the byte array BB looks like UTF-8 encoded text.
function is_UTF8_CODE(BB : TBytes) : Boolean;

// Map a TCODE_TYPE value to the corresponding TEncoding instance.
function GetEncodingType(code: TCODE_TYPE): Tencoding;

// Detect the encoding of the file FileName and return it as a TEncoding.
function GetFileEncoding(FileName : string) : Tencoding;

implementation

{ Detects the encoding of the file FileName and returns the matching
  TEncoding instance. Detection is delegated to Get_FileCode_TYPE and the
  enum-to-TEncoding mapping to GetEncodingType. }
function GetFileEncoding(FileName : string) : Tencoding;
begin
  Result := GetEncodingType(Get_FileCode_TYPE(FileName));
end;

{ Translates a TCODE_TYPE value into the TEncoding instance that decodes it.
  ctANSI, and any value outside the enumeration, yields TEncoding.ANSI. }
function GetEncodingType(code: TCODE_TYPE): Tencoding;
begin
  if code = ctUnicode then
    Result := TEncoding.Unicode
  else if code = ctUTF8 then
    Result := TEncoding.UTF8
  else if code = ctUnicodeBIG then
    Result := TEncoding.BigEndianUnicode
  else
    Result := TEncoding.ANSI;   // ctANSI and the catch-all fallback
end;

// Determine the encoding format of a file.
//
// FileName : path of the file to inspect.
// Returns  : the detected TCODE_TYPE. ctANSI is returned when the file does
//            not exist, is empty, or matches none of the checks below.
// Raises   : whatever TFileStream raises if the file cannot be opened or read.
//
// Checks, in order:
//   1. Byte-order mark: FF FE -> ctUnicode, FE FF -> ctUnicodeBIG,
//      EF BB BF -> ctUTF8.
//   2. is_UTF8_CODE over the whole content -> ctUTF8.
//   3. First $00 byte in the content:
//        - odd file length          -> cannot be 2-byte Unicode, ctANSI fallback
//        - $00 is the very first    -> ctUnicodeBIG
//        - byte before $00 is < $80 -> ctUnicode, otherwise ctUnicodeBIG
//   4. No $00 byte at all -> ctANSI.
//
// NOTE(review): a $00 byte is below $80 and therefore passes the UTF-8 check,
// so BOM-less UTF-16 text made only of ASCII-range characters is reported as
// ctUTF8 by step 2 before step 3 is reached - confirm whether that ordering
// is intended.
function Get_FileCode_TYPE(FileName : string) : TCODE_TYPE;
var
  FS : TFileStream;
  MB : TBytes;
  Len, i : NativeInt;
begin
  // ctANSI is the answer for every path that reaches no more specific verdict.
  Result := ctANSI;
  if not FileExists(FileName) then Exit;

  // Read the whole file into memory; the stream is released even if the
  // read raises.
  FS := TFileStream.Create(FileName, fmOpenRead or fmShareDenyWrite);
  try
    Len := FS.Size;
    SetLength(MB, Len);
    if Len > 0 then
      FS.ReadBuffer(MB[0], Len);
  finally
    FS.Free;
  end;

  // Nothing to inspect in an empty file.
  if Len = 0 then Exit;

  // 1. Byte-order marks. Each test is guarded by the length so that 1- and
  //    2-byte files never index past the end of the buffer.
  if Len >= 2 then
  begin
    if (MB[0] = $FF) and (MB[1] = $FE) then Exit(ctUnicode);
    if (MB[0] = $FE) and (MB[1] = $FF) then Exit(ctUnicodeBIG);
  end;
  if (Len >= 3) and (MB[0] = $EF) and (MB[1] = $BB) and (MB[2] = $BF) then
    Exit(ctUTF8);

  // 2. No BOM: check whether the content is UTF-8.
  if is_UTF8_CODE(MB) then Exit(ctUTF8);

  // 3. Look for the first $00 byte to tell 2-byte Unicode from ANSI.
  for i := 0 to Len - 1 do
    if MB[i] = 0 then
    begin
      // A $00 byte in an odd-length file: not 2-byte Unicode; use the fallback.
      if Odd(Len) then Exit(ctANSI);
      // Leading $00 with an even length is taken as big-endian.
      if i = 0 then Exit(ctUnicodeBIG);
      if MB[i - 1] < $80 then
        Exit(ctUnicode)
      else
        Exit(ctUnicodeBIG);
    end;

  // 4. No $00 byte anywhere: Result is still ctANSI.
end;


// Judge whether the content of stream M looks like ANSI-encoded text.
//
// Criteria:
//   - no $00 byte anywhere in the content;
//   - bytes >= $80 must appear in runs of even length (double-byte
//     characters), including a run that reaches the end of the data.
//
// M       : stream holding the file content. Its Position is reset to 0 and
//           the whole content is read, so the position is at the end on return.
// Returns : True when both criteria hold (an empty stream yields True),
//           False otherwise.
function is_ANSI_CODE(M : TMemoryStream) : Boolean;
var
  MB : TBytes;
  Len, i : NativeInt;
  HighRun : NativeInt;   // length of the current run of bytes >= $80
begin
  Len := M.Size;
  M.Position := 0;
  SetLength(MB, Len);
  // Guard the read: MB[0] does not exist for an empty stream.
  if Len > 0 then
    M.Read(MB[0], Len);

  HighRun := 0;
  for i := 0 to Len - 1 do
  begin
    if MB[i] = 0 then Exit(False);     // a $00 byte rules out ANSI text
    if MB[i] >= $80 then
      Inc(HighRun)
    else
    begin
      // A run of high bytes just ended; it must have an even length.
      if Odd(HighRun) then Exit(False);
      HighRun := 0;
    end;
  end;

  // A run still open at the end of the data must be even as well.
  Result := not Odd(HighRun);
end;


// Judge whether the byte array BB is structurally valid UTF-8.
//
// Every byte is classified by its leading bits:
//   $00..$7F : single-byte character
//   $80..$BF : continuation byte - invalid as the start of a character
//   $C0..$DF : lead byte of a 2-byte character (1 continuation byte follows)
//   $E0..$EF : lead byte of a 3-byte character (2 continuation bytes follow)
//   $F0..$F4 : lead byte of a 4-byte character (3 continuation bytes follow)
//   $F5..$FF : never valid in UTF-8
// Each continuation byte must have the form 10xxxxxx, and a character must
// not be cut off by the end of the data.
//
// BB      : the bytes to examine.
// Returns : True when all bytes form complete UTF-8 sequences (an empty
//           array and pure 7-bit data both yield True), False otherwise.
function is_UTF8_CODE(BB : TBytes) : Boolean;
var
  Lead : Byte;
  Len, i, k, Trail : NativeInt;
begin
  Len := Length(BB);
  i := 0;
  // Walk up to and including the last byte.
  while i < Len do
  begin
    Lead := BB[i];

    // Number of continuation bytes announced by the lead byte.
    if Lead < $80 then
      Trail := 0
    else if Lead < $C0 then
      Exit(False)            // stray continuation byte
    else if Lead < $E0 then
      Trail := 1
    else if Lead < $F0 then
      Trail := 2
    else if Lead < $F5 then
      Trail := 3
    else
      Exit(False);           // $F5..$FF cannot occur in UTF-8

    // The whole sequence must fit inside the data.
    if i + Trail >= Len then Exit(False);

    // Every continuation byte must match 10xxxxxx.
    for k := 1 to Trail do
      if (BB[i + k] and $C0) <> $80 then Exit(False);

    Inc(i, Trail + 1);
  end;
  Result := True;
end;

end.

原网站

版权声明
本文为[sensor_ WU]所创,转载请带上原文链接,感谢
https://yzsam.com/2022/162/202206112302226476.html