开发者

I have a htm file can i read it as UTF-8 formatted file without doing anything to the file

I have an .htm file. Can I read it as a UTF-8 file without modifying the file itself? I am not sure how the file was saved (possibly as "Unicode"), but unless I read it as UTF-8 the text shows up as boxes. This has to be done in Java.

// Read the whole file line by line and append every line to loContent.
// No charset is passed to FileReader here, so the JVM's default charset is used for decoding.
FileReader loInput = new FileReader(loFile);
BufferedReader loBufferReader = new BufferedReader(loInput);
String loLine; // String that holds current loFile loLine
int loCount = 0; // Line number of loCount
loLine = loBufferReader.readLine();
loCount++;
while (loLine != null) {
    // readLine() strips the line terminator, so lines are joined without separators.
    loContent = loContent.concat(loLine);
    loLine = loBufferReader.readLine();
    loCount++;
}
loBufferReader.close();

i tried this

EDIT: i have to get the data from HTML file and convert it into a DOM object for further processing

I am using

// Build a Document from the markup held in loContent, then pick the single node matching "/Div".
SAXBuilder loSaxBuilder = new SAXBuilder();
Document loDoc = loSaxBuilder.build(new StringReader(loContent));
Element loElement = (Element) XPath.newInstance("/Div").selectSingleNode(loDoc);

to convert it to dom object


First, Unicode is not itself an encoding; it is a character set that can be stored using several encodings, of which UTF-8 is one.

Second, you could use something like

// Open the file as raw bytes, then decode those bytes explicitly as UTF-8.
InputStream rawBytes = new FileInputStream(file);
Reader in = new BufferedReader(new InputStreamReader(rawBytes, "UTF-8"));

Then read from `in` in a loop and append to a StringBuilder or CharBuffer, or write to a StringWriter. It would probably be easiest, though, to just use IOUtils.toString(InputStream, String) from Apache Commons IO, which I recommend.


You can.

Use an InputStreamReader with the required Unicode encoding (e.g., UTF-8).

// The charset name "UTF8" is handed to InputStreamReader so the bytes are decoded with it.
InputStreamReader decoded = new InputStreamReader(new FileInputStream(fileDir), "UTF8");
BufferedReader br = new BufferedReader(decoded);


Done using :

import java.io.*;
import java.nio.charset.Charset;
import java.util.*;
/**
 * Guesses the character encoding of a block of bytes.
 *
 * <p>The guess is a heuristic: a byte order mark (UTF-8, UTF-16LE, UTF-16BE) wins if present;
 * otherwise the bytes are scanned for well-formed UTF-8 style multi-byte sequences. Bytes that
 * are all 7-bit are reported as US-ASCII (or as the default charset when
 * {@link #setEnforce8Bit(boolean)} is on). Anything else falls back to the default charset.
 */
public class CharsetToolkit {
    private byte[] buffer;
    private Charset defaultCharset;
    private boolean enforce8Bit = false;

    /**
     * Creates a toolkit over the given bytes, using the system charset as fallback.
     *
     * @param buffer the bytes to inspect; must not be null.
     * @throws NullPointerException if {@code buffer} is null.
     */
    public CharsetToolkit(byte[] buffer)
    {
        if (buffer == null) {
            throw new NullPointerException("buffer");
        }
        this.buffer = buffer;
        this.defaultCharset = getDefaultSystemCharset();
    }

    /**
     * Creates a toolkit over the given bytes with an explicit fallback charset.
     *
     * @param buffer the bytes to inspect; must not be null.
     * @param defaultCharset the charset returned when no better guess can be made;
     *        {@code null} selects the system charset.
     * @throws NullPointerException if {@code buffer} is null.
     */
    public CharsetToolkit(byte[] buffer, Charset defaultCharset)
    {
        if (buffer == null) {
            throw new NullPointerException("buffer");
        }
        this.buffer = buffer;
        setDefaultCharset(defaultCharset);
    }

    /**
     * Sets the fallback charset.
     *
     * @param defaultCharset the new fallback; {@code null} selects the system charset.
     */
    public void setDefaultCharset(Charset defaultCharset)
    {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        } else {
            this.defaultCharset = getDefaultSystemCharset();
        }
    }

    /**
     * Controls what is reported for purely 7-bit input.
     *
     * @param enforce if true, 7-bit input is reported as the default charset instead of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce)
    {
        this.enforce8Bit = enforce;
    }

    /**
     * @return true if 7-bit input is reported as the default charset instead of US-ASCII.
     */
    public boolean getEnforce8Bit()
    {
        return this.enforce8Bit;
    }

    /**
     * @return the fallback charset used when no better guess can be made.
     */
    public Charset getDefaultCharset()
    {
        return defaultCharset;
    }

    /**
     * Guesses the encoding of the bytes this toolkit was created with.
     *
     * <p>Every byte of the buffer is examined. A multi-byte sequence that is cut off by the end
     * of the buffer is accepted as long as the continuation bytes that are present are
     * well-formed, because the buffer is typically only a prefix of a larger file.
     *
     * @return UTF-8/UTF-16LE/UTF-16BE when the matching byte order mark is present; otherwise
     *         US-ASCII (or the default charset when enforce8Bit is set) if no byte has its high
     *         bit set, UTF-8 if all high-bit bytes form valid sequences, and the default charset
     *         in every other case.
     */
    public Charset guessEncoding()
    {
        if (hasUTF8Bom(buffer)) {
            return Charset.forName("UTF-8");
        }
        if (hasUTF16LEBom(buffer)) {
            return Charset.forName("UTF-16LE");
        }
        if (hasUTF16BEBom(buffer)) {
            return Charset.forName("UTF-16BE");
        }

        boolean highOrderBit = false;
        boolean validU8Char = true;
        int length = buffer.length;
        int i = 0;

        while (validU8Char && i < length)
        {
            byte lead = buffer[i];
            if (lead < 0)
            {
                highOrderBit = true;
                int expected = continuationBytesFor(lead);
                if (expected == 0)
                {
                    // stray continuation byte, or a byte that can never start a sequence
                    validU8Char = false;
                }
                else
                {
                    // Only check the continuation bytes that actually exist in the buffer.
                    int last = Math.min(i + expected, length - 1);
                    for (int j = i + 1; j <= last; j++)
                    {
                        if (!isContinuationChar(buffer[j]))
                        {
                            validU8Char = false;
                            break;
                        }
                    }
                    // skip the continuation bytes that were just verified
                    i = last;
                }
            }
            i++;
        }

        // if no byte with an high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!highOrderBit)
        {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit) {
                return this.defaultCharset;
            }
            return Charset.forName("US-ASCII");
        }

        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (validU8Char) {
            return Charset.forName("UTF-8");
        }

        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }

    /**
     * Guesses the encoding of a file from its first bytes, falling back to the system charset.
     *
     * @param f the file to inspect.
     * @param bufferLength the maximum number of leading bytes to read and inspect.
     * @return the guessed charset.
     * @throws FileNotFoundException if the file cannot be opened.
     * @throws IOException if reading fails.
     */
    public static Charset guessEncoding(File f, int bufferLength) throws FileNotFoundException, IOException
    {
        return guessEncoding(f, bufferLength, getDefaultSystemCharset());
    }

    /**
     * Guesses the encoding of a file from its first bytes.
     *
     * @param f the file to inspect.
     * @param bufferLength the maximum number of leading bytes to read and inspect.
     * @param defaultCharset the fallback charset; {@code null} selects the system charset.
     * @return the guessed charset.
     * @throws FileNotFoundException if the file cannot be opened.
     * @throws IOException if reading fails.
     */
    public static Charset guessEncoding(File f, int bufferLength, Charset defaultCharset) throws FileNotFoundException, IOException
    {
        CharsetToolkit toolkit = new CharsetToolkit(readPrefix(f, bufferLength));
        toolkit.setDefaultCharset(defaultCharset);
        return toolkit.guessEncoding();
    }

    /**
     * Reads up to {@code maxBytes} bytes from the start of the file.
     *
     * <p>A single {@code read} call may deliver fewer bytes than requested, so reading is
     * repeated until the buffer is full or end of file is reached. The stream is closed even
     * when reading fails.
     *
     * @return an array holding exactly the bytes that were read (no zero padding).
     */
    private static byte[] readPrefix(File f, int maxBytes) throws IOException
    {
        byte[] data = new byte[maxBytes];
        int filled = 0;
        FileInputStream fis = new FileInputStream(f);
        try
        {
            while (filled < maxBytes)
            {
                int n = fis.read(data, filled, maxBytes - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        }
        finally
        {
            fis.close();
        }
        if (filled == maxBytes) {
            return data;
        }
        byte[] trimmed = new byte[filled];
        System.arraycopy(data, 0, trimmed, 0, filled);
        return trimmed;
    }

    /**
     * Maps a lead byte to the number of continuation bytes that must follow it.
     *
     * @return 1 to 5 for the first byte of a two- to six-byte sequence, 0 for any other byte.
     */
    private static int continuationBytesFor(byte lead)
    {
        if (isTwoBytesSequence(lead)) {
            return 1;
        }
        if (isThreeBytesSequence(lead)) {
            return 2;
        }
        if (isFourBytesSequence(lead)) {
            return 3;
        }
        if (isFiveBytesSequence(lead)) {
            return 4;
        }
        if (isSixBytesSequence(lead)) {
            return 5;
        }
        return 0;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character.
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b)
    {
        return -128 <= b && b <= -65;
    }

    /**
     * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a two-bytes sequence.
     */
    private static boolean isTwoBytesSequence(byte b)
    {
        return -64 <= b && b <= -33;
    }

    /**
     * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a three-bytes sequence.
     */
    private static boolean isThreeBytesSequence(byte b)
    {
        return -32 <= b && b <= -17;
    }

    /**
     * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a four-bytes sequence.
     */
    private static boolean isFourBytesSequence(byte b)
    {
        return -16 <= b && b <= -9;
    }

    /**
     * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a five-bytes sequence.
     */
    private static boolean isFiveBytesSequence(byte b)
    {
        return -8 <= b && b <= -5;
    }

    /**
     * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a six-bytes sequence.
     */
    private static boolean isSixBytesSequence(byte b)
    {
        return -4 <= b && b <= -3;
    }

    /**
     * @return the charset named by the {@code file.encoding} system property.
     */
    public static Charset getDefaultSystemCharset()
    {
        return Charset.forName(System.getProperty("file.encoding"));
    }

    /**
     * @return true if the bytes start with EF BB BF; false if fewer than three bytes are given.
     */
    private static boolean hasUTF8Bom(byte[] bom)
    {
        return bom.length >= 3 && bom[0] == -17 && bom[1] == -69 && bom[2] == -65;
    }

    /**
     * @return true if the bytes start with FF FE; false if fewer than two bytes are given.
     */
    private static boolean hasUTF16LEBom(byte[] bom)
    {
        return bom.length >= 2 && bom[0] == -1 && bom[1] == -2;
    }

    /**
     * @return true if the bytes start with FE FF; false if fewer than two bytes are given.
     */
    private static boolean hasUTF16BEBom(byte[] bom)
    {
        return bom.length >= 2 && bom[0] == -2 && bom[1] == -1;
    }

    /**
     * @return all charsets reported by {@link Charset#availableCharsets()}.
     */
    public static Charset[] getAvailableCharsets()
    {
        return (Charset[]) Charset.availableCharsets().values().toArray(new Charset[0]);
    }

    /**
     * Demo: guesses the charset of a file and prints the file decoded with that charset.
     *
     * @param args optional; {@code args[0]} is the file to read, defaulting to
     *        {@code windows-1252.txt} when no argument is given.
     * @throws FileNotFoundException if the file cannot be opened.
     * @throws IOException if reading fails.
     */
    public static void main(String[] args) throws FileNotFoundException, IOException
    {
        File file = new File(args.length > 0 ? args[0] : "windows-1252.txt");
        Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096);
        System.err.println("Charset found: " + guessedCharset.displayName());
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), guessedCharset));
        try
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                System.out.println(line);
            }
        }
        finally
        {
            // closing the outermost reader also closes the underlying file stream
            br.close();
        }
    }
}

and by calling this

// Guess the charset from up to 4096 leading bytes, then open a reader that decodes with it.
File htmFile = new File("filename.htm");
Charset guessedCharset = CharsetToolkit.guessEncoding(htmFile, 4096);
InputStreamReader decoded = new InputStreamReader(new FileInputStream(htmFile), guessedCharset);
BufferedReader c = new BufferedReader(decoded);
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜