PHP中GBK和UTF8编码处理
一、编码范围
1. GBK (GB2312/GB18030)
\x00-\xff GBK双字节编码范围
\x20-\x7f ASCII
\xa1-\xff 中文
\x80-\xff 中文
2. UTF-8 (Unicode)
\u4e00-\u9fa5 (中文)
\u3130-\u318F (韩文)
\uAC00-\uD7A3 (韩文)
\u0800-\u4e00 (日文)
ps: 韩文音节 (\uAC00-\uD7A3) 是大于 [\u9fa5] 的字符
正则例子:
preg_replace("/([\x80-\xff])/","",$str);
preg_replace("/([\x{4e00}-\x{9fa5}])/u","",$str);
二、代码例子
//判断内容里有没有中文-GBK (PHP)
/**
 * Report whether a GBK-encoded byte string contains a double-byte character.
 *
 * The test is purely byte-based: it looks for one byte in the range
 * 0x80-0xFF followed by any single byte other than a newline.
 *
 * @param string $s Raw bytes to inspect (expected to be GBK/GB2312 text).
 * @return int|false 1 if such a byte pair is present, 0 if not,
 *                   false if preg_match() itself fails.
 */
function check_is_chinese($s) {
    // Single quotes hand the \x escapes to PCRE untouched. Without the
    // backslashes the class would match plain ASCII such as "0".."x".
    return preg_match('/[\x80-\xff]./', $s);
}
//获取字符串长度-GBK (PHP)
/**
 * Count the characters in a GBK-encoded byte string.
 *
 * A byte with the high bit set (0x80-0xFF) is treated as the first byte of
 * a two-byte character and consumed together with the byte that follows;
 * every other byte counts as one character.
 *
 * @param string $str Raw GBK bytes.
 * @return int Number of characters (single-byte and double-byte alike count 1).
 */
function gb_strlen($str) {
    $byteLength = strlen($str); // hoisted: the length never changes in the loop
    $count = 0;
    for ($i = 0; $i < $byteLength; $i++) {
        // High-bit byte: skip its trail byte so the pair is counted once.
        if (ord($str[$i]) >= 0x80) {
            $i++;
        }
        $count++;
    }
    return $count;
}
//截取字符串字串-GBK (PHP)
/**
 * Return the first $len characters of a GBK-encoded byte string without
 * cutting a two-byte character in half.
 *
 * A byte with the high bit set (0x80-0xFF) is treated as the first byte of
 * a two-byte character and is kept together with the byte that follows.
 *
 * @param string $str Raw GBK bytes.
 * @param int    $len Number of characters to keep, counted from the start.
 *                    If the character count never equals $len (for example
 *                    $len is larger than the string), the whole string is
 *                    returned.
 * @return string The leading bytes that make up those characters.
 */
function gb_substr($str, $len) {
    $byteLength = strlen($str); // hoisted: the length never changes in the loop
    $count = 0;
    // Loose != mirrors the original "stop when $count == $len" check.
    for ($i = 0; $i < $byteLength && $count != $len; $i++) {
        // High-bit byte: step over its trail byte so both are kept together.
        if (ord($str[$i]) >= 0x80) {
            $i++;
        }
        $count++;
    }
    // $i is now the byte offset just past the last character kept.
    return substr($str, 0, $i);
}
//统计字符串长度-UTF8 (PHP)
/**
 * Measure a UTF-8 byte string in width units.
 *
 * Each ASCII byte (0-127) adds 1. Each multi-byte character adds 2,
 * regardless of whether it is encoded in two, three or four bytes.
 * This is the same unit that utf8_substr() uses for its position and
 * length arguments.
 *
 * The script is terminated with die() if a byte above 127 is found where a
 * lead byte is expected but it is not in the range 192-247.
 *
 * @param string $str Raw UTF-8 bytes.
 * @return int Total width: 1 per ASCII byte, 2 per multi-byte character.
 */
function utf8_strlen($str) {
    $byteLength = strlen($str); // hoisted: the length never changes in the loop
    $count = 0;
    for ($i = 0; $i < $byteLength; $i++) {
        $value = ord($str[$i]);
        if ($value > 127) {
            // Extra unit for a multi-byte character (so it totals 2 below).
            $count++;
            // Skip the continuation bytes implied by the lead byte.
            if ($value >= 192 && $value <= 223) {
                $i++;
            } elseif ($value >= 224 && $value <= 239) {
                $i += 2;
            } elseif ($value >= 240 && $value <= 247) {
                $i += 3;
            } else {
                die('Not a UTF-8 compatible string');
            }
        }
        $count++;
    }
    return $count;
}
//截取字符串-UTF8(PHP)
/**
 * Extract part of a UTF-8 byte string, measured in width units, without
 * splitting a multi-byte character.
 *
 * Width units: each ASCII byte counts 1; each multi-byte character counts 2,
 * regardless of how many bytes encode it.
 *
 * The slice starts at the first character boundary whose running width is
 * at least $position, and ends at the first boundary where at least $length
 * units have been collected since that start (or at the end of the string).
 *
 * The script is terminated with die() if a byte above 127 is found where a
 * lead byte is expected but it is not in the range 192-247.
 *
 * @param string $str      Raw UTF-8 bytes.
 * @param int    $position Starting offset in width units.
 * @param int    $length   Number of width units to take.
 * @return string The selected bytes; '' when $position lies at or beyond
 *                the end of the string.
 */
function utf8_substr($str, $position, $length) {
    $byteLength  = strlen($str); // hoisted: the length never changes in the loop
    $startOffset = $byteLength;  // byte offset where the slice begins
    $startUnits  = 0;            // running width at the moment the slice begins
    $endOffset   = $byteLength;  // byte offset just past the slice
    $started     = false;
    $count       = 0;

    for ($i = 0; $i < $byteLength; $i++) {
        if (!$started && $count >= $position) {
            $started     = true;
            $startOffset = $i;
            $startUnits  = $count;
        }
        // Only test for the end once the start is known. Testing earlier
        // (with $startUnits still 0) ended the scan prematurely whenever
        // $position was greater than $length.
        if ($started && ($count - $startUnits) >= $length) {
            $endOffset = $i;
            break;
        }

        $value = ord($str[$i]);
        if ($value > 127) {
            // Extra unit for a multi-byte character (so it totals 2 below).
            $count++;
            // Skip the continuation bytes implied by the lead byte.
            if ($value >= 192 && $value <= 223) {
                $i++;
            } elseif ($value >= 224 && $value <= 239) {
                $i += 2;
            } elseif ($value >= 240 && $value <= 247) {
                $i += 3;
            } else {
                die('Not a UTF-8 compatible string');
            }
        }
        $count++;
    }

    if (!$started) {
        return '';
    }
    return substr($str, $startOffset, $endOffset - $startOffset);
}
//字符串长度统计-UTF8 [中文3个字节,俄文、韩文占2个字节,字母占1个字节] (Ruby)
# Measure a URL-encoded UTF-8 string in width units.
#
# The argument is first decoded with CGI::unescape. In the decoded text
# every ASCII character counts 1 and every non-ASCII character counts 2,
# regardless of how many bytes encode it.
#
# str - URL-encoded String.
#
# Returns the total width as an Integer.
def utf8_string_length(str)
  temp = CGI::unescape(str)
  width = 0
  # Walk characters rather than bytes: indexing a String yields a String on
  # Ruby >= 1.9, and counting whole characters avoids tracking byte state.
  temp.each_char do |ch|
    width += ch.ascii_only? ? 1 : 2
  end
  return width
end
//判断是否是有韩文-UTF-8 (JavaScript)
function checkKoreaChar ( str ) {
for ( i = 0 ; i < str . length ; i + + ) { if ( ( ( str . charCodeAt ( i ) > 0x3130 & & str . charCodeAt ( i ) < 0x318F ) | | ( str . charCodeAt ( i ) > = 0xAC00 & & str . charCodeAt ( i ) < = 0xD7A3 ) ) ) {
return true ;
}
}
return false