Tag Archive for utf-8

50 States & Washington D.C.

-- Lookup table holding one row per U.S. state plus the District of Columbia.
-- AUTO_INCREMENT is preset to 52 so that rows added after the 51 seeded ids
-- (1-51) continue the sequence.
CREATE TABLE `state` (
    `id`           TINYINT(4) NOT NULL AUTO_INCREMENT,  -- surrogate key
    `name`         CHAR(50)   NOT NULL DEFAULT '',      -- full name
    `abbreviation` CHAR(2)    NOT NULL DEFAULT '',      -- two-letter abbreviation
    PRIMARY KEY (`id`)
)
ENGINE = MyISAM
AUTO_INCREMENT = 52
DEFAULT CHARSET = utf8;

-- Seed data: the 50 states plus the District of Columbia, in alphabetical order.
-- Ids are written as integer literals (the column is TINYINT) instead of quoted
-- strings, and all rows are loaded with a single multi-row INSERT.
INSERT INTO `state` (`id`, `name`, `abbreviation`) VALUES
    (1,  'Alabama',              'AL'),
    (2,  'Alaska',               'AK'),
    (3,  'Arizona',              'AZ'),
    (4,  'Arkansas',             'AR'),
    (5,  'California',           'CA'),
    (6,  'Colorado',             'CO'),
    (7,  'Connecticut',          'CT'),
    (8,  'Delaware',             'DE'),
    (9,  'District of Columbia', 'DC'),
    (10, 'Florida',              'FL'),
    (11, 'Georgia',              'GA'),
    (12, 'Hawaii',               'HI'),
    (13, 'Idaho',                'ID'),
    (14, 'Illinois',             'IL'),
    (15, 'Indiana',              'IN'),
    (16, 'Iowa',                 'IA'),
    (17, 'Kansas',               'KS'),
    (18, 'Kentucky',             'KY'),
    (19, 'Louisiana',            'LA'),
    (20, 'Maine',                'ME'),
    (21, 'Maryland',             'MD'),
    (22, 'Massachusetts',        'MA'),
    (23, 'Michigan',             'MI'),
    (24, 'Minnesota',            'MN'),
    (25, 'Mississippi',          'MS'),
    (26, 'Missouri',             'MO'),
    (27, 'Montana',              'MT'),
    (28, 'Nebraska',             'NE'),
    (29, 'Nevada',               'NV'),
    (30, 'New Hampshire',        'NH'),
    (31, 'New Jersey',           'NJ'),
    (32, 'New Mexico',           'NM'),
    (33, 'New York',             'NY'),
    (34, 'North Carolina',       'NC'),
    (35, 'North Dakota',         'ND'),
    (36, 'Ohio',                 'OH'),
    (37, 'Oklahoma',             'OK'),
    (38, 'Oregon',               'OR'),
    (39, 'Pennsylvania',         'PA'),
    (40, 'Rhode Island',         'RI'),
    (41, 'South Carolina',       'SC'),
    (42, 'South Dakota',         'SD'),
    (43, 'Tennessee',            'TN'),
    (44, 'Texas',                'TX'),
    (45, 'Utah',                 'UT'),
    (46, 'Vermont',              'VT'),
    (47, 'Virginia',             'VA'),
    (48, 'Washington',           'WA'),
    (49, 'West Virginia',        'WV'),
    (50, 'Wisconsin',            'WI'),
    (51, 'Wyoming',              'WY');

source

Convert text ISO to UTF-8

# Re-encode iso.txt from ISO-8859-1 to UTF-8 and write the result to utf.txt.
iconv -f ISO-8859-1 -t UTF-8 iso.txt > utf.txt

source

Cut a Long String Down to a Moderate Display Length Without Splitting UTF-8 Characters

/**
 * Take the leading part of a UTF-8 string without cutting a multi-byte
 * character in half.
 *
 * Length is counted in "English letter" units: a single-byte (ASCII) character
 * counts as 1 and any multi-byte character (2-, 3- or 4-byte sequence) counts
 * as 2. The input is expected to be a UTF-8 string, e.g. as read straight from
 * the database.
 *
 * A multi-byte character that starts before the limit is kept whole, so the
 * result can exceed $strlen by one unit.
 *
 * @param string $str    UTF-8 encoded input string.
 * @param int    $strlen Number of units to keep (ASCII = 1, multi-byte = 2).
 * @return string The extracted prefix. If nothing was extracted (for example
 *                $strlen <= 0), the input is returned with '&nbsp;' entities
 *                already replaced by spaces.
 */
function CuttingStr($str, $strlen) {
    // Convert the HTML entity '&nbsp;' to a plain space first, so the entity
    // is counted as one character and is never cut in the middle.
    $str = str_replace('&nbsp;', ' ', $str);

    $byte_count     = strlen($str); // loop-invariant, computed once
    $output_str_len = 0;            // accumulated length of the extracted text, in units
    $output_str     = '';           // extracted text

    // Walk the original string one lead byte at a time.
    for ($i = 0; $i < $byte_count; $i++) {
        // Stop as soon as the requested length has been reached.
        if ($output_str_len >= $strlen) {
            break;
        }

        // Value of the current byte.
        $str_bit = ord($str[$i]);

        if ($str_bit < 128) {
            // Below 128: single-byte ASCII character, counts as 1 unit.
            $char_bytes = 1;
            $char_units = 1;
        } elseif ($str_bit > 191 && $str_bit < 224) {
            // Lead byte 192-223: 2-byte UTF-8 sequence, counts as 2 units.
            $char_bytes = 2;
            $char_units = 2;
        } elseif ($str_bit > 223 && $str_bit < 240) {
            // Lead byte 224-239: 3-byte UTF-8 sequence, counts as 2 units.
            $char_bytes = 3;
            $char_units = 2;
        } elseif ($str_bit > 239 && $str_bit < 248) {
            // Lead byte 240-247: 4-byte UTF-8 sequence, counts as 2 units.
            $char_bytes = 4;
            $char_units = 2;
        } else {
            // Any other byte (128-191 or 248-255) is skipped without being
            // copied or counted.
            continue;
        }

        $output_str_len += $char_units;
        $output_str     .= substr($str, $i, $char_bytes);
        $i              += $char_bytes - 1; // the for-loop's $i++ steps over the last byte
    }

    // When nothing was extracted, return the original string instead.
    return ($output_str == '') ? $str : $output_str;
}

source

Convert Postgres ascii dump to UTF-8

# Re-encode ascii.sql from WINDOWS-1252 to UTF-8 and write the result to utf8.sql.
# -c makes iconv drop characters it cannot convert instead of aborting.
iconv -c --from-code=WINDOWS-1252 --to-code=UTF-8 ascii.sql > utf8.sql

source

PHP 文字コード判定

/**
 * Guess the character encoding of a string that may contain Japanese text.
 *
 * Starts from mb_detect_encoding() and then refines an "eucJP-win" or
 * "SJIS-win" guess with round-trip conversions and byte-range heuristics,
 * because the byte ranges of EUC-JP, Shift_JIS and UTF-8 overlap.
 *
 * @param string $str Raw byte string to examine.
 * @return string|false 'ASCII', 'JIS', 'UTF-8', 'eucJP-win' or 'SJIS-win';
 *                      FALSE when the initial mb_detect_encoding() call
 *                      returns FALSE.
 */
function detect_encoding_ja( $str )
{
    $enc = @mb_detect_encoding( $str, 'ASCII,JIS,eucJP-win,SJIS-win,UTF-8' );

    switch ( $enc ) {
    case FALSE   :
    case 'ASCII' :
    case 'JIS'   :
    case 'UTF-8' :
        // These results are returned as they are.
        break;

    case 'eucJP-win' :
        // If eucJP-win is still detected when it is tried last, accept it.
        if ( @mb_detect_encoding( $str, 'SJIS-win,UTF-8,eucJP-win' ) === 'eucJP-win' ) {
            break;
        }
        $_hint = "\xbf\xfd" . $str; // "\xbf\xfd" : EUC-JP "雀"

        // Remove characters whose mapping changes in an EUC-JP -> UTF-8
        // conversion (e.g. ≒ ≡ ∫) so they cannot break the round trip below.
        mb_regex_encoding( 'EUC-JP' );
        $_hint = mb_ereg_replace(
            "\xad(?:\xe2|\xf5|\xf6|\xf7|\xfa|\xfb|\xfc|\xf0|\xf1|\xf2)",
            '',
            $_hint
        );

        // Round trip: eucJP-win -> UTF-8 -> eucJP-win.
        $_tmp  = mb_convert_encoding( $_hint, 'UTF-8', 'eucJP-win' );
        $_tmp2 = mb_convert_encoding( $_tmp,  'eucJP-win', 'UTF-8' );
        if ( $_tmp2 === $_hint ) {

            // Exceptions: byte ranges that should be recognised as something
            // other than EUC-JP.
            if (
                // Range overlapping SJIS (2-byte | 3-byte | i-mode emoji | 1-byte characters)
                // NOTE(review): the tail of the \xF9[...] class was truncated in the
                // source after "\x80"; "-\xB0\xB1-\xFC" is reconstructed from the
                // i-mode emoji code ranges -- confirm against the upstream original.
                ! preg_match( '/^(?:'
                    . '[\x8E\xE0-\xE9][\x80-\xFC]|\xEA[\x80-\xA4]|'
                    . '\x8F[\xB0-\xEF][\xE0-\xEF][\x40-\x7F]|'
                    . '\xF8[\x9F-\xFC]|\xF9[\x40-\x49\x50-\x52\x55-\x57\x5B-\x5E\x72-\x7E\x80-\xB0\xB1-\xFC]|'
                    . '[\x00-\x7E]'
                    . ')+$/', $str ) &&

                // Range overlapping UTF-8 (full-width alphanumerics | kanji | 1-byte characters)
                ! preg_match( '/^(?:'
                    . '\xEF\xBC[\xA1-\xBA]|[\x00-\x7E]|'
                    . '[\xE4-\xE9][\x8E-\x8F\xA1-\xBF][\x8F\xA0-\xEF]|'
                    . '[\x00-\x7E]'
                    . ')+$/', $str )
            ) {
                // Outside both overlapping ranges: detected as eucJP-win.
                break;
            }

            // Exception 2: some compound words that are likely to be frequent
            // are judged as eucJP-win.
            // (珈琲|琥珀|瑪瑙|癇癪|碼碯|耄碌|膀胱|蒟蒻|薔薇|蜻蛉)
            if ( mb_ereg( '^(?:'
                    . '\xE0\xDD\xE0\xEA|\xE0\xE8\xE0\xE1|\xE0\xF5\xE0\xEF|\xE1\xF2\xE1\xFB|'
                    . '\xE2\xFB\xE2\xF5|\xE6\xCE\xE2\xF1|\xE7\xAF\xE6\xF9|\xE8\xE7\xE8\xEA|'
                    . '\xE9\xAC\xE9\xAF|\xE9\xF1\xE9\xD9|[\x00-\x7E]'
                    . ')+$', $str )
            ) {
                break;
            }
        }
        // Not confirmed as eucJP-win: intentionally fall through to the
        // SJIS-win / UTF-8 checks below.

    default :
        // If SJIS-win is detected here, the encoding is judged to be SJIS-win.
        $enc = @mb_detect_encoding( $str, 'UTF-8,SJIS-win' );
        if ( $enc === 'SJIS-win' ) {
            break;
        }
        // Use SJIS-win as the default answer.
        $enc   = 'SJIS-win';

        $_hint = "\xe9\x9b\x80" . $str; // "\xe9\x9b\x80" : UTF-8 "雀"

        // Adjust characters whose mapping changes during conversion.
        mb_regex_encoding( 'UTF-8' );
        $_hint = mb_ereg_replace( "\xe3\x80\x9c", "\xef\xbd\x9e", $_hint ); // U+301C -> U+FF5E
        $_hint = mb_ereg_replace( "\xe2\x88\x92", "\xe3\x83\xbc", $_hint ); // U+2212 -> U+30FC
        $_hint = mb_ereg_replace( "\xe2\x80\x96", "\xe2\x88\xa5", $_hint ); // U+2016 -> U+2225

        // Round trip: UTF-8 -> SJIS-win -> UTF-8.
        $_tmp  = mb_convert_encoding( $_hint, 'SJIS-win', 'UTF-8' );
        $_tmp2 = mb_convert_encoding( $_tmp,  'UTF-8', 'SJIS-win' );

        if ( $_tmp2 === $_hint ) {
            $enc = 'UTF-8';
        }
        // Handle the range where UTF-8 overlaps two SJIS characters (SJIS wins).
        if ( preg_match( '/^(?:[\xE4-\xE9][\x80-\xBF][\x80-\x9F][\x00-\x7F])+/', $str ) ) {
            $enc = 'SJIS-win';
        }
    }
    return $enc;
}

source

iconv conversion

require 'iconv'
# Builds URL-friendly permalink strings and offers a +has_permalink+ macro
# that fills a permalink attribute from another attribute.
module PermalinkFu
  class << self
    # Target and source encodings handed to Iconv.iconv by +escape+.
    attr_accessor :translation_to
    attr_accessor :translation_from

    # Turns +str+ into a lowercase, dash-separated permalink.
    #
    # The string is converted from +translation_from+ to +translation_to+,
    # every run of non-word characters becomes a single separator, and
    # leading/trailing separators are dropped.
    def escape(str)
      # Iconv.iconv returns an Array of converted strings; join it explicitly
      # rather than relying on Array#to_s, which yields inspect-style output
      # (brackets, quotes, escaped characters) on Ruby 1.9 and later.
      s = Iconv.iconv(translation_to, translation_from, str).join
      s.gsub!(/\W+/, ' ') # all non-word chars to spaces
      s.strip!            # drop leading/trailing spaces
      s.downcase!
      s.gsub!(/ +/, '-')  # spaces to dashes, preferred separator char everywhere
      s
    end
  end

  # Registers an +after_validation+ callback that stores the escaped value of
  # +attr_name+ in +permalink_field+ (default 'permalink') whenever that field
  # is still empty.
  # NOTE(review): +after_validation+ is not defined in this module; it is
  # presumably supplied by the class that extends PermalinkFu -- confirm.
  def has_permalink(attr_name, permalink_field = nil)
    permalink_field ||= 'permalink'
    after_validation do |record|
      if record.send(permalink_field).to_s.empty?
        record.send("#{permalink_field}=", PermalinkFu.escape(record.send(attr_name).to_s))
      end
    end
  end
end

# Default conversion: UTF-8 input to ASCII.
PermalinkFu.translation_to   = 'ascii//ignore//translit'
PermalinkFu.translation_from = 'utf-8'

source