利用 Unicode 码点范围判断中文英文字符

判断 unicode 是否是汉字、数字、英文或者其他字符,全角符号转半角符号,unicode 字符串归一化等.

#!/usr/bin/env python

# -*- coding:GBK -*-

"""汉字处理的工具:

判断unicode是否是汉字,数字,英文,或者其他字符。

全角符号转半角符号。

"""

def is_chinese(uchar):
    """Return True if ``uchar`` is a Chinese character.

    A character counts as Chinese when it falls in the inclusive range
    U+4E00..U+9FA5. Ideographs outside that range are reported as False.

    Args:
        uchar: A single unicode character.

    Returns:
        bool: True when ``uchar`` is inside the range, False otherwise.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'

def is_number(uchar):
    """Return True if ``uchar`` is an ASCII digit (``0``-``9``).

    Args:
        uchar: A single unicode character.

    Returns:
        bool: True when ``uchar`` is in U+0030..U+0039, False otherwise.
    """
    return u'0' <= uchar <= u'9'

def is_alphabet(uchar):
    """Return True if ``uchar`` is an ASCII English letter.

    Args:
        uchar: A single unicode character.

    Returns:
        bool: True when ``uchar`` is in ``A``-``Z`` or ``a``-``z``,
        False otherwise.
    """
    return (u'A' <= uchar <= u'Z') or (u'a' <= uchar <= u'z')

def is_other(uchar):
    """Return True if ``uchar`` is not Chinese, not a digit and not a letter.

    The decision is delegated to ``is_chinese``, ``is_number`` and
    ``is_alphabet``; a character is "other" when all three reject it.

    Args:
        uchar: A single unicode character.

    Returns:
        bool: True when none of the three classifiers accepts ``uchar``.
    """
    return not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar))

try:
    _unichr = unichr  # Python 2: chr() only covers 0-255 there
except NameError:
    _unichr = chr  # Python 3: chr() already returns a unicode character


def B2Q(uchar):
    """Convert a half-width character to its full-width form.

    Args:
        uchar: A single unicode character.

    Returns:
        The full-width counterpart when ``uchar`` is in U+0020..U+007E;
        otherwise ``uchar`` itself, unchanged.

    Raises:
        TypeError: If ``uchar`` is not exactly one character (from ``ord``).
    """
    inside_code = ord(uchar)
    if inside_code < 0x0020 or inside_code > 0x7e:
        # Not a half-width character: hand it back untouched.
        return uchar
    if inside_code == 0x0020:
        # The space is the one exception to the fixed offset below.
        return _unichr(0x3000)
    # For everything else: full-width = half-width + 0xfee0.
    return _unichr(inside_code + 0xfee0)

try:
    _unichr = unichr  # Python 2: chr() only covers 0-255 there
except NameError:
    _unichr = chr  # Python 3: chr() already returns a unicode character


def Q2B(uchar):
    """Convert a full-width character to its half-width form.

    Args:
        uchar: A single unicode character.

    Returns:
        The half-width counterpart when ``uchar`` is the ideographic space
        (U+3000) or lies in U+FF01..U+FF5E; otherwise ``uchar`` itself,
        unchanged.

    Raises:
        TypeError: If ``uchar`` is not exactly one character (from ``ord``).
    """
    inside_code = ord(uchar)
    if inside_code == 0x3000:
        # The space is the one exception to the fixed offset below.
        return _unichr(0x0020)
    # Start at U+FF01, not U+FF00: U+FF00 is unassigned and must not be
    # turned into a space just because subtracting the offset yields 0x20.
    if 0xff01 <= inside_code <= 0xff5e:
        # half-width = full-width - 0xfee0
        return _unichr(inside_code - 0xfee0)
    # Not a full-width character: hand it back untouched.
    return uchar

def stringQ2B(ustring):
    """Convert every full-width character in a string to half-width.

    Each character is passed through ``Q2B``; characters that ``Q2B`` does
    not convert are kept as they are.

    Args:
        ustring: A unicode string (any iterable of single characters).

    Returns:
        The converted string.
    """
    return "".join(Q2B(uchar) for uchar in ustring)

def uniform(ustring):
    """Normalize a string: full-width to half-width, then lower-case.

    Args:
        ustring: A unicode string.

    Returns:
        The result of ``stringQ2B(ustring)`` converted to lower case.
    """
    return stringQ2B(ustring).lower()

def string2List(ustring):
    """Split a string into runs of Chinese characters, letters and digits.

    Every character for which ``is_other`` returns True acts as a separator
    and is dropped. Consecutive non-separator characters are joined into one
    item, so Chinese characters, letters and digits that are adjacent stay
    together in the same item.

    Args:
        ustring: A unicode string.

    Returns:
        list: The non-empty runs, in their original order. Empty when
        ``ustring`` is empty or contains only separators.
    """
    retList = []
    utmp = []  # characters of the run currently being collected
    for uchar in ustring:
        if not is_other(uchar):
            utmp.append(uchar)
        elif utmp:
            # A separator ends the current run; consecutive separators
            # (empty run) are simply skipped.
            retList.append("".join(utmp))
            utmp = []
    if utmp:
        # Flush the final run when the string does not end with a separator.
        retList.append("".join(utmp))
    return retList

 # test Q2B and B2Q: round-trip every printable ASCII character
try:
    _unichr = unichr  # Python 2: chr() only covers 0-255 there
except NameError:
    _unichr = chr  # Python 3: chr() already returns a unicode character

for i in range(0x0020, 0x007F):
    full_width = B2Q(_unichr(i))
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(u"%s %s" % (Q2B(full_width), full_width))

# test uniform
ustring = u'中国 人名a高频A'
ustring = uniform(ustring)
ret = string2List(ustring)
print(ret)

**以下为中英数字判断参考代码**

def is_ch_num_eng(str_pd):
    """Report which character classes occur in ``str_pd``.

    Three classes are tracked: Chinese characters (U+4E00..U+9FA5), ASCII
    digits and ASCII English letters. Any other character is ignored.

    Args:
        str_pd: A unicode string, scanned character by character.

    Returns:
        str: ``'ch'``, ``'num'`` or ``'eng'`` when exactly one class occurs;
        ``'ch+num'``, ``'ch+eng'`` or ``'eng+num'`` when two occur;
        ``'all'`` when all three occur; ``'null'`` when none does.
    """
    seen_ch = False
    seen_num = False
    seen_eng = False

    for char in str_pd:
        # The three tests are independent and all run for every character.
        if u'\u4e00' <= char <= u'\u9fa5':
            seen_ch = True
        if u'A' <= char <= u'Z' or u'a' <= char <= u'z':
            seen_eng = True
        if u'0' <= char <= u'9':
            seen_num = True

    # Keyed by (Chinese seen, digit seen, letter seen).
    labels = {
        (False, False, False): 'null',
        (False, True, False): 'num',
        (False, False, True): 'eng',
        (True, False, False): 'ch',
        (False, True, True): 'eng+num',
        (True, True, False): 'ch+num',
        (True, False, True): 'ch+eng',
        (True, True, True): 'all',
    }
    return labels[(seen_ch, seen_num, seen_eng)]