Read an RSS Feed Using StAX for XML Parsing
This entry introduces how to read an RSS feed by using StAX.
RSS: stands for Rich Site Summary. An RSS document (called a feed or web feed) includes full or summarized text and metadata such as the publishing date and the author's name. An RSS document is an XML file whose format is defined by the RSS specification.
StAX: stands for Streaming API for XML; it is an API for processing XML.
* Create a Domain model to represent a Feed.
Feed Example
<?xml version="1.0" encoding="UTF-8"?> <?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?> <rss xmlns:media="http://search.yahoo.com/mrss/" xmlns:atom="http://www.w3.org/2005/Atom" version="2.0"> <channel> <title>BBC News - Asia</title> <link>http://www.bbc.co.uk/news/world/asia/#sa-ns_mchannel=rss&amp;ns_source=PublicRSS20-sa</link> <description>The latest stories from the Asia section of the BBC News web site.</description> <language>en-gb</language> <lastBuildDate>Sat, 23 Nov 2013 08:47:43 GMT</lastBuildDate> <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/2/hi/help/rss/4498287.stm for terms and conditions of reuse.</copyright> <ttl>15</ttl> <atom:link href="http://feeds.bbci.co.uk/news/world/asia/rss.xml" rel="self" type="application/rss+xml"/> <item> <title>China creates 'air-defence zone'</title> <description>China demarcates an "air-defence identification zone" over the East China Sea, including islands that are also claimed by Japan.</description> <link>http://www.bbc.co.uk/news/world-asia-25062525#sa-ns_mchannel=rss&amp;ns_source=PublicRSS20-sa</link> <guid isPermaLink="false">http://www.bbc.co.uk/news/world-asia-25062525</guid> <pubDate>Sat, 23 Nov 2013 09:02:26 GMT</pubDate> <media:thumbnail width="66" height="49" url="http://news.bbcimg.co.uk/media/images/71298000/jpg/_71298080_e4739517-4b93-445e-ac3f-e84f805b54aa.jpg"/> <media:thumbnail width="144" height="81" url="http://news.bbcimg.co.uk/media/images/71298000/jpg/_71298081_e4739517-4b93-445e-ac3f-e84f805b54aa.jpg"/> </item> <item> <title>Deadly bomb blasts hit Karachi</title> <description>At least seven people are reported to have been killed in two bomb explosions in a predominantly Shia area of Pakistan's southern city of Karachi.</description> <link>http://www.bbc.co.uk/news/world-asia-25058015#sa-ns_mchannel=rss&amp;ns_source=PublicRSS20-sa</link> <guid isPermaLink="false">http://www.bbc.co.uk/news/world-asia-25058015</guid> 
<pubDate>Fri, 22 Nov 2013 22:02:47 GMT</pubDate> <media:thumbnail width="66" height="49" url="http://news.bbcimg.co.uk/media/images/71297000/jpg/_71297424_71296480.jpg"/> <media:thumbnail width="144" height="81" url="http://news.bbcimg.co.uk/media/images/71297000/jpg/_71297425_71296480.jpg"/> </item> </channel> </rss>
Feed item class: represents an item
package jbohn.xml.rss.model; public class FeedItem { String title; String author; String description; String link; String guid; public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getAuthor() { return author; } public void setAuthor(String author) { this.author = author; } public String getDescription() { return description; } public void setDescription(String description) { this.description = description; } public String getLink() { return link; } public void setLink(String link) { this.link = link; } public String getGuid() { return guid; } public void setGuid(String guid) { this.guid = guid; } public String toString() { return "FeedMessage [title=" + title + ", description=" + description + ", link=" + link + ", author=" + author + ", guid=" + guid + "]"; } }
Feed class:
package jbohn.xml.rss.model;

import java.util.ArrayList;
import java.util.List;

/**
 * The channel-level data of an RSS feed together with its items.
 *
 * <p>The header values are fixed at construction time; the item list starts
 * empty and is filled by adding to the list returned from
 * {@link #getMessages()}.
 */
public class Feed {

    final String title;
    final String link;
    final String description;
    final String language;
    final String copyright;
    final String pubDate;

    final List<FeedItem> entries = new ArrayList<>();

    /**
     * Creates a feed header with no items.
     *
     * @param title       channel title
     * @param link        channel link
     * @param description channel description
     * @param language    channel language
     * @param copyright   channel copyright notice
     * @param pubDate     channel publication date, kept as the raw string
     */
    public Feed(String title, String link, String description, String language,
            String copyright, String pubDate) {
        this.title = title;
        this.link = link;
        this.description = description;
        this.language = language;
        this.copyright = copyright;
        this.pubDate = pubDate;
    }

    /**
     * @return the live, mutable list of items; additions made through the
     *         returned list are stored in this feed
     */
    public List<FeedItem> getMessages() {
        return this.entries;
    }

    /** @return the channel title */
    public String getTitle() {
        return this.title;
    }

    /** @return the channel link */
    public String getLink() {
        return this.link;
    }

    /** @return the channel description */
    public String getDescription() {
        return this.description;
    }

    /** @return the channel language */
    public String getLanguage() {
        return this.language;
    }

    /** @return the channel copyright notice */
    public String getCopyright() {
        return this.copyright;
    }

    /** @return the channel publication date as given to the constructor */
    public String getPubDate() {
        return this.pubDate;
    }

    /** Renders the header values (not the items) on a single line. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Feed [copyright=");
        text.append(copyright);
        text.append(", description=").append(description);
        text.append(", language=").append(language);
        text.append(", link=").append(link);
        text.append(", pubDate=").append(pubDate);
        text.append(", title=").append(title);
        text.append("]");
        return text.toString();
    }
}
RSSFeedParser class
package jbohn.xml.rss.read;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.XMLEvent;

import jbohn.xml.rss.model.Feed;
import jbohn.xml.rss.model.FeedItem;

/**
 * Reads an RSS document from a URL with the StAX event API and converts it
 * into a {@link Feed} holding one {@link FeedItem} per {@code <item>}.
 *
 * <p>Only un-prefixed elements are interpreted. Prefixed extension elements
 * such as {@code <atom:link>} or {@code <media:thumbnail>} are skipped, so
 * they cannot overwrite the value of the core element that shares their
 * local name.
 */
public class RSSFeedParser {

    static final String TITLE = "title";
    static final String DESCRIPTION = "description";
    static final String CHANNEL = "channel";
    static final String LANGUAGE = "language";
    static final String COPYRIGHT = "copyright";
    static final String LINK = "link";
    static final String AUTHOR = "author";
    static final String ITEM = "item";
    static final String PUB_DATE = "pubDate";
    static final String GUID = "guid";

    final URL url;

    /**
     * Creates a parser for the feed at the given address.
     *
     * @param feedURL address of the RSS document
     * @throws RuntimeException wrapping the {@link MalformedURLException}
     *                          if {@code feedURL} is not a valid URL
     */
    public RSSFeedParser(String feedURL) {
        try {
            this.url = new URL(feedURL);
        } catch (MalformedURLException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Downloads and parses the feed.
     *
     * @return the parsed feed, never {@code null}; a document without any
     *         {@code <item>} yields a feed with an empty item list
     * @throws RuntimeException wrapping the {@link IOException} or
     *                          {@link XMLStreamException} if the document
     *                          cannot be read or is not well-formed
     */
    public Feed readFeed() {
        // try-with-resources closes the connection stream on every exit path.
        try (InputStream in = read()) {
            XMLEventReader eventReader = newInputFactory().createXMLEventReader(in);
            try {
                return parse(eventReader);
            } finally {
                // XMLEventReader is not AutoCloseable, so it is released by hand.
                eventReader.close();
            }
        } catch (XMLStreamException | IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Opens the feed URL, converting an I/O failure into an unchecked exception. */
    private InputStream read() {
        try {
            return url.openStream();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Builds a StAX factory configured for reading feeds from an untrusted source. */
    private static XMLInputFactory newInputFactory() {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        // Deliver the text of an element as ONE Characters event. Without this,
        // text containing entity references or CDATA sections may be split into
        // several events and only the first fragment would be picked up.
        factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
        // The document comes from the network: do not process DTDs or resolve
        // external entities (XXE hardening).
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        return factory;
    }

    /**
     * Returns the text that directly follows the start element just read.
     *
     * <p>The next event is only consumed when it is character data; an end
     * tag or a nested start tag is left in the reader for the main loop.
     *
     * @return the element text, or the empty string if the element has none
     */
    private String getCharacterData(XMLEventReader eventReader) throws XMLStreamException {
        XMLEvent next = eventReader.peek();
        if (next instanceof Characters) {
            return eventReader.nextEvent().asCharacters().getData();
        }
        return "";
    }

    /** Walks the event stream and assembles the feed header and its items. */
    private Feed parse(XMLEventReader eventReader) throws XMLStreamException {
        Feed feed = null;

        // Values start as empty strings. Until the first <item> they describe
        // the channel; afterwards they describe the item currently being read.
        String description = "";
        String title = "";
        String link = "";
        String language = "";
        String copyright = "";
        String author = "";
        String pubdate = "";
        String guid = "";

        while (eventReader.hasNext()) {
            XMLEvent event = eventReader.nextEvent();

            if (event.isStartElement()) {
                QName name = event.asStartElement().getName();
                if (!name.getPrefix().isEmpty()) {
                    // Extension element (atom:, media:, ...): not a core RSS field.
                    continue;
                }
                switch (name.getLocalPart()) {
                case ITEM:
                    if (feed == null) {
                        // First item reached: everything collected so far is the header.
                        feed = new Feed(title, link, description, language, copyright, pubdate);
                    }
                    // Start every item from a clean slate so that a field missing
                    // from this item is not inherited from the channel or from the
                    // previous item.
                    title = "";
                    description = "";
                    link = "";
                    author = "";
                    guid = "";
                    pubdate = "";
                    break;
                case TITLE:
                    title = getCharacterData(eventReader);
                    break;
                case DESCRIPTION:
                    description = getCharacterData(eventReader);
                    break;
                case LINK:
                    link = getCharacterData(eventReader);
                    break;
                case GUID:
                    guid = getCharacterData(eventReader);
                    break;
                case LANGUAGE:
                    language = getCharacterData(eventReader);
                    break;
                case AUTHOR:
                    author = getCharacterData(eventReader);
                    break;
                case PUB_DATE:
                    pubdate = getCharacterData(eventReader);
                    break;
                case COPYRIGHT:
                    copyright = getCharacterData(eventReader);
                    break;
                default:
                    break;
                }
            } else if (event.isEndElement()) {
                QName name = event.asEndElement().getName();
                if (feed != null && name.getPrefix().isEmpty()
                        && ITEM.equals(name.getLocalPart())) {
                    FeedItem message = new FeedItem();
                    message.setAuthor(author);
                    message.setDescription(description);
                    message.setGuid(guid);
                    message.setLink(link);
                    message.setTitle(title);
                    feed.getMessages().add(message);
                }
            }
        }

        if (feed == null) {
            // No <item> in the document: still hand back the header values.
            feed = new Feed(title, link, description, language, copyright, pubdate);
        }
        return feed;
    }
}
Running:
package jbohn.xml.rss.main;

import jbohn.xml.rss.model.Feed;
import jbohn.xml.rss.model.FeedItem;
import jbohn.xml.rss.read.RSSFeedParser;

/**
 * Command-line demo: downloads an RSS feed and prints its header followed by
 * one line per item.
 */
public class RSSMain {

    /** Feed that is read when no URL is given on the command line. */
    private static final String DEFAULT_FEED_URL =
            "http://feeds.bbci.co.uk/news/world/asia/rss.xml";

    /**
     * Entry point.
     *
     * @param args optional; {@code args[0]} is the URL of the feed to read.
     *             Without arguments the BBC Asia feed is used.
     */
    public static void main(String[] args) {
        String feedUrl = (args != null && args.length > 0) ? args[0] : DEFAULT_FEED_URL;

        RSSFeedParser parser = new RSSFeedParser(feedUrl);
        Feed feed = parser.readFeed();
        if (feed == null) {
            // Guard against a parser that hands back no feed, instead of
            // failing with a NullPointerException on getMessages() below.
            System.err.println("No feed content found at " + feedUrl);
            return;
        }

        System.out.println(feed);
        for (FeedItem message : feed.getMessages()) {
            System.out.println(message);
        }
    }
}
Summarized by jbohn.
Reference: http://www.vogella.com/articles/RSSFeed/article.html#rssoverview
Does this distinguish between RSS and Atom?
To me it looks like it would read
[link]http://www.bbc.co.uk/news/world/asia/#sa-ns_mchannel=rss&ns_source=PublicRSS20-sa[/link]
and then this link would be overridden by
[atom:link href="http://feeds.bbci.co.uk/news/world/asia/rss.xml" rel="self" type="application/rss+xml"/]
How do you handle that?