Retrieving HTML encoded text from XML using SAXParser
This is my first time using SAXParser (I'm using it in Android, but I don't think that makes a difference for this particular issue), and I'm trying to read in data from an RSS feed. So far, it's working great for me for the most part, but I'm having trouble when it gets to a tag that contains HTML-encoded text (e.g. &lt;a href=&quot;http://...
). The characters()
method only reads in the &lt;
as a <
, then treats the next set of characters as a separate entity, rather than taking the entire contents at once. I would rather it just read the text in as it is, without actually translating the HTML. The code I'm using for my document handler (shortened) is posted below:
/**
 * Records which RSS container element has just been opened.
 *
 * <p>Seeing {@code <channel>} switches the handler into channel mode. While in
 * channel mode the feed object is created on first use, and opening an
 * {@code <image>} or {@code <item>} element lazily creates the matching
 * holder object and raises the corresponding state flag.
 *
 * @param uri       namespace URI of the element (unused)
 * @param localName local element name; compared case-insensitively
 * @param qName     qualified element name (unused)
 * @param attrs     element attributes (unused)
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException {
    if (localName.equalsIgnoreCase("channel")) {
        inChannel = true;
    }

    // Nothing below applies until a <channel> has been opened.
    if (!inChannel) {
        return;
    }

    if (newFeed == null) {
        newFeed = new Feed();
    }

    if (localName.equalsIgnoreCase("image")) {
        if (feedImage == null) {
            feedImage = new Image();
        }
        inImage = true;
    }

    if (localName.equalsIgnoreCase("item")) {
        if (newItem == null) {
            newItem = new Item();
        }
        if (itemList == null) {
            itemList = new ArrayList<Item>();
        }
        inItem = true;
    }
}
/**
 * Copies the most recently read text ({@code currentValue}) onto the object
 * that owns the element being closed.
 *
 * <p>An open item takes precedence over an open image, which in turn takes
 * precedence over the enclosing channel. Closing {@code <item>},
 * {@code <image>} or {@code <channel>} itself finalises the corresponding
 * object and clears its state flag. Element names are compared
 * case-insensitively; unrecognised elements are ignored.
 *
 * @param uri       namespace URI of the element (unused)
 * @param localName local element name
 * @param qName     qualified element name (unused)
 */
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
    if (inItem) {
        if (localName.equalsIgnoreCase("item")) {
            // Finished with this news item: queue it and reset the item state.
            itemList.add(newItem);
            newItem = null;
            inItem = false;
        } else if (localName.equalsIgnoreCase("title")) {
            newItem.setTitle(currentValue);
        } else if (localName.equalsIgnoreCase("link")) {
            newItem.setLink(currentValue);
        } else if (localName.equalsIgnoreCase("description")) {
            newItem.setDescription(currentValue);
        } else if (localName.equalsIgnoreCase("author")) {
            newItem.setAuthor(currentValue);
        } else if (localName.equalsIgnoreCase("category")) {
            newItem.addCategory(currentValue);
        } else if (localName.equalsIgnoreCase("comments")) {
            newItem.setComments(currentValue);
        } else if (localName.equalsIgnoreCase("guid")) {
            newItem.setGuid(currentValue);
        } else if (localName.equalsIgnoreCase("pubDate")) {
            newItem.setPubDate(currentValue);
        }
        // "enclosure" is not handled yet; to be implemented later.
        return;
    }

    if (inImage) {
        if (localName.equalsIgnoreCase("image")) {
            // Finished with the feed image: attach it and reset the image state.
            newFeed.setImage(feedImage);
            feedImage = null;
            inImage = false;
        } else if (localName.equalsIgnoreCase("url")) {
            feedImage.setUrl(currentValue);
        } else if (localName.equalsIgnoreCase("title")) {
            feedImage.setTitle(currentValue);
        } else if (localName.equalsIgnoreCase("link")) {
            feedImage.setLink(currentValue);
        }
        return;
    }

    if (!inChannel) {
        return;
    }

    if (localName.equalsIgnoreCase("channel")) {
        // Reached the end of the feed: publish the result and reset.
        newFeed.setItems((ArrayList<Item>) itemList);
        finalFeed = newFeed;
        newFeed = null;
        inChannel = false;
    } else if (localName.equalsIgnoreCase("title")) {
        newFeed.setTitle(currentValue);
    } else if (localName.equalsIgnoreCase("link")) {
        newFeed.setLink(currentValue);
    } else if (localName.equalsIgnoreCase("description")) {
        newFeed.setDescription(currentValue);
    } else if (localName.equalsIgnoreCase("language")) {
        newFeed.setLanguage(currentValue);
    } else if (localName.equalsIgnoreCase("copyright")) {
        newFeed.setCopyright(currentValue);
    } else if (localName.equalsIgnoreCase("category")) {
        newFeed.addCategory(currentValue);
    }
}
/**
 * Remembers the text chunk just reported by the parser in {@code currentValue}.
 *
 * <p>NOTE: each call replaces the previous value. A SAX parser may deliver the
 * text of a single element in several calls (for example, split around an
 * entity reference such as {@code &lt;}), in which case only the last chunk
 * survives here.
 *
 * @param ch     buffer holding the characters
 * @param start  index of the first character of the chunk in {@code ch}
 * @param length number of characters in the chunk
 */
@Override
public void characters(char[] ch, int start, int length) {
    String chunk = new String(ch, start, length);
    currentValue = chunk;
}
And an example of the RSS feed I'm trying to parse is this one.
Any ideas?
Wonderful. This solution confused me a little, and I couldn't obtain a value for localName like you have, but I was still able to get the StringBuilder approach to work.
I didn't replace in the method:
public void characters(char[] ch, int start, int length) throws SAXException {
tempVal = new String(ch,start,length);
But instead added the following line to the method:
tempSB = tempSB.append(new String(ch, start, length));
Where tempSB is a StringBuilder object. That meant I didn't need to alter my entire parser, and could simply switch to reading the SB when it was necessary. When I came to an element that contained html, in startElement, I used:
tempSB.delete(0, tempSB.length());
And in endElement I used:
tempText.setText(tempSB.toString()) ;
Simple as that. No complex boolean system required in my case, and no need to access localName, which is a concept that eludes me. I seem to do just fine accessing qName.
Thanks very much kcoppock for posting the solution you found. I've been looking for hours and this is the only article I could find concise and clear enough to help. The task I'm working on is really urgent, and I would have failed without your help.
In case it helps anyone, I was able to solve this issue by using a boolean for every field in which I'm interested in the data. Then I just continued to append to a StringBuilder until I reached a closing tag, after which I took the StringBuilder value, then emptied it, and set my boolean to false.
/**
 * Clears the text buffer and records which element has just been opened.
 *
 * <p>Opening {@code <channel>} starts a fresh feed and item list. Inside a
 * channel, opening {@code <image>} or {@code <item>} creates the matching
 * holder object; any other element raises the per-field flag for the
 * container currently open (image, item, or the channel itself). Element
 * names are compared case-insensitively.
 *
 * @param uri       namespace URI of the element (unused)
 * @param localName local element name
 * @param qName     qualified element name (unused)
 * @param attrs     element attributes (unused)
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException {
    // Drop any text collected before this tag.
    sb.delete(0, sb.length());

    if (localName.equalsIgnoreCase("channel")) {
        inChannel = true;
        newFeed = new Feed();
        itemList = new ArrayList<Item>();
    }

    if (!inChannel) {
        return;
    }

    // Container elements: create the holder and switch mode.
    if (localName.equalsIgnoreCase("image")) {
        feedImage = new Image();
        inImage = true;
        return;
    }
    if (localName.equalsIgnoreCase("item")) {
        newItem = new Item();
        inItem = true;
        return;
    }

    if (inImage) {
        // Flags for elements nested in <image>.
        if (localName.equalsIgnoreCase("title")) {
            imgTitle = true;
        } else if (localName.equalsIgnoreCase("link")) {
            imgLink = true;
        } else if (localName.equalsIgnoreCase("url")) {
            imgURL = true;
        }
    } else if (inItem) {
        // Flags for elements nested in <item>.
        if (localName.equalsIgnoreCase("title")) {
            iTitle = true;
        } else if (localName.equalsIgnoreCase("link")) {
            iLink = true;
        } else if (localName.equalsIgnoreCase("description")) {
            iDescription = true;
        } else if (localName.equalsIgnoreCase("author")) {
            iAuthor = true;
        } else if (localName.equalsIgnoreCase("category")) {
            iCategory = true;
        } else if (localName.equalsIgnoreCase("comments")) {
            iComments = true;
        } else if (localName.equalsIgnoreCase("guid")) {
            iGuid = true;
        } else if (localName.equalsIgnoreCase("pubdate")) {
            iPubDate = true;
        } else if (localName.equalsIgnoreCase("source")) {
            iSource = true;
        }
    } else {
        // Flags for elements directly under <channel>.
        if (localName.equalsIgnoreCase("title")) {
            fTitle = true;
        } else if (localName.equalsIgnoreCase("link")) {
            fLink = true;
        } else if (localName.equalsIgnoreCase("description")) {
            fDescription = true;
        } else if (localName.equalsIgnoreCase("language")) {
            fLanguage = true;
        } else if (localName.equalsIgnoreCase("copyright")) {
            fCopyright = true;
        } else if (localName.equalsIgnoreCase("category")) {
            fCategory = true;
        }
    }
}
/**
 * Handles a closing tag by storing the text accumulated in {@code sb} on the
 * feed image, the current item, or the feed, depending on which container is
 * open, and lowering the matching per-field flag.
 *
 * <p>Differences from the earlier revision of this method:
 * <ul>
 *   <li>Channel-level {@code title}, {@code link}, {@code language},
 *       {@code copyright} and {@code category} now read the accumulated
 *       buffer instead of {@code currentValue}, which {@code characters()}
 *       no longer updates.</li>
 *   <li>Closing {@code <image>} now attaches the image to the feed and leaves
 *       image mode; previously {@code inImage} was never reset, so every
 *       following element (including all items) was treated as an image
 *       child.</li>
 * </ul>
 *
 * @param uri       namespace URI of the element (unused)
 * @param localName local element name; compared case-insensitively
 * @param qName     qualified element name (unused)
 */
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
    if (!inChannel) {
        return;
    }
    if (inImage) {
        endImageElement(localName);
    } else if (inItem) {
        endItemElement(localName);
    } else {
        endChannelElement(localName);
    }
}

/** Returns the text collected since the last reset and empties the buffer. */
private String takeText() {
    String text = sb.toString();
    sb.delete(0, sb.length());
    return text;
}

/** Stores a closed child of {@code <image>}, or finishes the image when {@code <image>} itself closes. */
private void endImageElement(String localName) {
    if (localName.equalsIgnoreCase("image")) {
        // Attach the finished image and leave image mode so later elements
        // are routed to the item/channel handlers again.
        newFeed.setImage(feedImage);
        feedImage = null;
        inImage = false;
    } else if (localName.equalsIgnoreCase("title")) {
        feedImage.setTitle(takeText());
        imgTitle = false;
    } else if (localName.equalsIgnoreCase("link")) {
        feedImage.setLink(takeText());
        imgLink = false;
    } else if (localName.equalsIgnoreCase("url")) {
        feedImage.setUrl(takeText());
        imgURL = false;
    }
}

/** Stores a closed child of {@code <item>}, or queues the item when {@code <item>} itself closes. */
private void endItemElement(String localName) {
    if (localName.equalsIgnoreCase("item")) {
        itemList.add(newItem);
        newItem = null;
        inItem = false;
    } else if (localName.equalsIgnoreCase("title")) {
        newItem.setTitle(takeText());
        iTitle = false;
    } else if (localName.equalsIgnoreCase("link")) {
        newItem.setLink(takeText());
        iLink = false;
    } else if (localName.equalsIgnoreCase("description")) {
        newItem.setDescription(takeText());
        iDescription = false;
    } else if (localName.equalsIgnoreCase("author")) {
        newItem.setAuthor(takeText());
        iAuthor = false;
    } else if (localName.equalsIgnoreCase("category")) {
        newItem.addCategory(takeText());
        iCategory = false;
    } else if (localName.equalsIgnoreCase("comments")) {
        newItem.setComments(takeText());
        iComments = false;
    } else if (localName.equalsIgnoreCase("guid")) {
        newItem.setGuid(takeText());
        iGuid = false;
    } else if (localName.equalsIgnoreCase("pubDate")) {
        newItem.setPubDate(takeText());
        iPubDate = false;
    }
    // "enclosure" is not handled yet; to be implemented later.
}

/** Stores a closed direct child of {@code <channel>}, or publishes the feed when {@code <channel>} itself closes. */
private void endChannelElement(String localName) {
    if (localName.equalsIgnoreCase("channel")) {
        newFeed.setItems((ArrayList<Item>) itemList);
        finalFeed = newFeed;
        newFeed = null;
        inChannel = false;
    } else if (localName.equalsIgnoreCase("title")) {
        newFeed.setTitle(takeText());
        fTitle = false;
    } else if (localName.equalsIgnoreCase("link")) {
        newFeed.setLink(takeText());
        fLink = false;
    } else if (localName.equalsIgnoreCase("description")) {
        newFeed.setDescription(takeText());
        fDescription = false;
    } else if (localName.equalsIgnoreCase("language")) {
        newFeed.setLanguage(takeText());
        fLanguage = false;
    } else if (localName.equalsIgnoreCase("copyright")) {
        newFeed.setCopyright(takeText());
        fCopyright = false;
    } else if (localName.equalsIgnoreCase("category")) {
        newFeed.addCategory(takeText());
        fCategory = false;
    }
}
/**
 * Appends the reported text chunk to the shared buffer {@code sb}.
 *
 * <p>Appending rather than assigning means text that the parser delivers in
 * several calls for one element ends up concatenated in the buffer.
 *
 * @param ch     buffer holding the characters
 * @param start  index of the first character of the chunk in {@code ch}
 * @param length number of characters in the chunk
 */
@Override
public void characters(char[] ch, int start, int length) {
    // Copy the range straight from the parser's buffer.
    sb.append(ch, start, length);
}
Special characters like that are enclosed in CDATA tags. You need to see that they are preserved; the SAX parser can then deal with them correctly.
精彩评论