Shift_JIS and UTF-8
Hi all,
The Japanese characters ソqソセ are saved in the DB table as 縬烤. If you use 縬烤 in an HTML file, it shows up correctly in a browser using Shift_JIS encoding.
Can anyone figure out what charset 縬烤 are in?
The reason why I have that question is I need to find a way to convert ソqソセ to UTF-8 via 縬烤.
Thanks.
[376 byte] By [
lei.javaa] at [2007-11-27 4:07:49]

# 5
>I did some research but it seems there is no easy way
If it were easy then it wouldn't be fun, would it?
For the duke stars, here is the code you need:
import java.io.*;
/*
* The file must be saved as a UTF-16 document and compiled with the "-encoding UTF-16" option.
*/
/**
 * Demonstration of hand-decoding UTF-8: a string is encoded to UTF-8 bytes,
 * the bytes are walked sequence by sequence, and every ASCII character or
 * three-byte character that lies in one of the listed Unicode blocks is
 * printed together with its escape forms and the block name.
 */
public class charsetCheck {

    /** Inclusive {first, last} code points of the reported Unicode blocks. */
    private static final int[][] BLOCK_RANGES = {
        {0x0E00, 0x0E7F},
        {0x2E80, 0x2EFF},
        {0x2F00, 0x2FDF},
        {0x2FF0, 0x2FFF},
        {0x3000, 0x303F},
        {0x3040, 0x309F},
        {0x30A0, 0x30FF},
        {0x3100, 0x312F},
        {0x3130, 0x318F},
        {0x3190, 0x319F},
        {0x31A0, 0x31BF},
        {0x31C0, 0x31EF},
        {0x31F0, 0x31FF},
        {0x3200, 0x32FF},
        {0x3300, 0x33FF},
        {0x3400, 0x4DBF},
        {0x4DC0, 0x4DFF},
        {0x4E00, 0x9FFF},
        {0xA000, 0xA48F},
        {0xA490, 0xA4CF},
        {0xF900, 0xFAFF},
        {0xFE30, 0xFE4F},
        {0xFF00, 0xFFEF},
    };

    /** Block names; entry n labels BLOCK_RANGES[n]. */
    private static final String[] BLOCK_NAMES = {
        "Thai",
        "CJK Radicals Supplement",
        "Kangxi Radicals",
        "Ideographic Description Characters",
        "CJK Symbols and Punctuation",
        "Hiragana",
        "Katakana",
        "Bopomofo",
        "Hangul Compatibility Jamo",
        "Kanbun",
        "Bopomofo Extended",
        "CJK Strokes",
        "Katakana Phonetic Extensions",
        "Enclosed CJK Letters and Months",
        "CJK Compatibility",
        "CJK Unified Ideographs Extension A",
        "Yijing Hexagram Symbols",
        "CJK Unified Ideographs",
        "Yi Syllables",
        "Yi Radicals",
        "CJK Compatibility Ideographs",
        "CJK Compatibility Forms",
        "CJK Halfwidth and Fullwidth Forms",
    };

    /**
     * Prints one line to standard output for every character of {@code s}
     * that is either ASCII or a three-byte UTF-8 character inside one of the
     * blocks in {@link #BLOCK_RANGES}.
     *
     * <p>Each line holds a four-digit upper-case hexadecimal escape, the
     * decimal HTML entity and the block name (ASCII lines additionally start
     * with the character itself). Two-byte and four-byte sequences, and
     * three-byte characters outside the listed blocks, are skipped silently.
     *
     * @param s the text to inspect; must not be {@code null}
     * @throws IllegalStateException if the runtime reports UTF-8 as unsupported
     */
    public static void charsetCheck(String s) {
        byte[] utfBytes;
        try {
            utfBytes = s.getBytes("UTF-8");
        } catch (UnsupportedEncodingException ex) {
            // Fail loudly instead of decoding a dummy buffer and printing a bogus line.
            throw new IllegalStateException("UTF-8 charset is not available", ex);
        }

        int i = 0;
        while (i < utfBytes.length) {
            int lead = utfBytes[i] & 0xFF;
            int length = sequenceLength(lead);
            if (i + length > utfBytes.length) {
                // Truncated sequence: stop rather than read past the end of the array.
                break;
            }
            if (lead < 0x80) {
                System.out.println((char) lead + " \\u" + hex4(lead) + " &#" + lead + "; US-ASCII");
            } else if (length == 3) {
                // 1110xxxx 10yyyyyy 10zzzzzz -> xxxxyyyyyyzzzzzz
                int codePoint = ((lead & 0x0F) << 12)
                        | ((utfBytes[i + 1] & 0x3F) << 6)
                        | (utfBytes[i + 2] & 0x3F);
                String block = blockName(codePoint);
                if (block != null) {
                    System.out.println("\\u" + hex4(codePoint) + " &#" + codePoint + "; " + block);
                }
            }
            // Every other lead byte (2-, 4-, 5-, 6-byte forms, stray continuation
            // bytes) is skipped without output.
            i += length;
        }
    }

    /** Returns the number of bytes in the UTF-8 sequence introduced by {@code lead} (0..255). */
    private static int sequenceLength(int lead) {
        if (lead >= 252) {
            return 6;
        }
        if (lead >= 248) {
            return 5;
        }
        if (lead >= 240) {
            return 4;
        }
        if (lead >= 224) {
            return 3;
        }
        if (lead >= 192) {
            return 2;
        }
        return 1;
    }

    /** Formats a value in 0..0xFFFF as exactly four upper-case hexadecimal digits. */
    private static String hex4(int value) {
        // Setting bit 16 forces five digits; dropping the first leaves a zero-padded four.
        return Integer.toHexString(0x10000 | value).substring(1).toUpperCase();
    }

    /** Returns the name of the listed block containing {@code codePoint}, or null if none does. */
    private static String blockName(int codePoint) {
        for (int n = 0; n < BLOCK_RANGES.length; n++) {
            if (codePoint >= BLOCK_RANGES[n][0] && codePoint <= BLOCK_RANGES[n][1]) {
                return BLOCK_NAMES[n];
            }
        }
        return null;
    }

    /** Runs the check on three sample strings. */
    public static void main(String[] args) {
        charsetCheck("ฟิลิปปินส์");// Thai
        charsetCheck("非律宾");// Chinese
        charsetCheck("ソqソセ");
    }
}
V.V.