Saturday

Reading an RSS Feed Using StAX for XML Parsing


This entry shows how to read an RSS feed using StAX.

RSS: stands for Rich Site Summary (also commonly expanded as Really Simple Syndication). An RSS document (called a feed or web feed) includes full or summarized text and metadata such as the publishing date and the author's name. An RSS document is an XML file whose structure is defined by the RSS specification.

StAX: stands for Streaming API for XML, a Java API for XML processing.

* Create a Domain model to represent a Feed.

 Feed Example

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>

<rss xmlns:media="http://search.yahoo.com/mrss/" xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">  
  <channel> 
    <title>BBC News - Asia</title>  
    <link>http://www.bbc.co.uk/news/world/asia/#sa-ns_mchannel=rss&amp;ns_source=PublicRSS20-sa</link>  
    <description>The latest stories from the Asia section of the BBC News web site.</description>  
    <language>en-gb</language>  
    <lastBuildDate>Sat, 23 Nov 2013 08:47:43 GMT</lastBuildDate>  
    <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/2/hi/help/rss/4498287.stm for terms and conditions of reuse.</copyright>  
    <ttl>15</ttl>  
    <atom:link href="http://feeds.bbci.co.uk/news/world/asia/rss.xml" rel="self" type="application/rss+xml"/>  
    <item> 
      <title>China creates 'air-defence zone'</title>  
      <description>China demarcates an "air-defence identification zone" over the East China Sea, including islands that are also claimed by Japan.</description>  
      <link>http://www.bbc.co.uk/news/world-asia-25062525#sa-ns_mchannel=rss&amp;ns_source=PublicRSS20-sa</link>  
      <guid isPermaLink="false">http://www.bbc.co.uk/news/world-asia-25062525</guid>  
      <pubDate>Sat, 23 Nov 2013 09:02:26 GMT</pubDate>  
      <media:thumbnail width="66" height="49" url="http://news.bbcimg.co.uk/media/images/71298000/jpg/_71298080_e4739517-4b93-445e-ac3f-e84f805b54aa.jpg"/>  
      <media:thumbnail width="144" height="81" url="http://news.bbcimg.co.uk/media/images/71298000/jpg/_71298081_e4739517-4b93-445e-ac3f-e84f805b54aa.jpg"/> 
    </item>  
    <item> 
      <title>Deadly bomb blasts hit Karachi</title>  
      <description>At least seven people are reported to have been killed in two bomb explosions in a predominantly Shia area of Pakistan's southern city of Karachi.</description>  
      <link>http://www.bbc.co.uk/news/world-asia-25058015#sa-ns_mchannel=rss&amp;ns_source=PublicRSS20-sa</link>  
      <guid isPermaLink="false">http://www.bbc.co.uk/news/world-asia-25058015</guid>  
      <pubDate>Fri, 22 Nov 2013 22:02:47 GMT</pubDate>  
      <media:thumbnail width="66" height="49" url="http://news.bbcimg.co.uk/media/images/71297000/jpg/_71297424_71296480.jpg"/>  
      <media:thumbnail width="144" height="81" url="http://news.bbcimg.co.uk/media/images/71297000/jpg/_71297425_71296480.jpg"/> 
    </item>  
 </channel> 
</rss>

FeedItem class: represents a single item of the feed.

package jbohn.xml.rss.model;

/**
 * One entry of a feed: a mutable bean holding the title, author,
 * description, link and guid of a single item.
 */
public class FeedItem 
{
 String title;
 String author;
 String description;
 String link;
 String guid;

 /** @return the item title */
 public String getTitle() {
  return this.title;
 }

 /** @return the item author */
 public String getAuthor() {
  return this.author;
 }

 /** @return the item description */
 public String getDescription() {
  return this.description;
 }

 /** @return the item link */
 public String getLink() {
  return this.link;
 }

 /** @return the item guid */
 public String getGuid() {
  return this.guid;
 }

 /** @param title the item title */
 public void setTitle(String title) {
  this.title = title;
 }

 /** @param author the item author */
 public void setAuthor(String author) {
  this.author = author;
 }

 /** @param description the item description */
 public void setDescription(String description) {
  this.description = description;
 }

 /** @param link the item link */
 public void setLink(String link) {
  this.link = link;
 }

 /** @param guid the item guid */
 public void setGuid(String guid) {
  this.guid = guid;
 }

 /**
  * Renders all fields on one line; unset fields are printed as "null".
  */
 @Override
 public String toString()
 {
  StringBuilder text = new StringBuilder("FeedMessage [");
  text.append("title=").append(title);
  text.append(", description=").append(description);
  text.append(", link=").append(link);
  text.append(", author=").append(author);
  text.append(", guid=").append(guid);
  text.append("]");
  return text.toString();
 }
}


Feed class:

package jbohn.xml.rss.model;

import java.util.ArrayList;
import java.util.List;

/**
 * Header data of a feed (title, link, description, language, copyright,
 * publication date) together with the list of its items. The header fields
 * are fixed at construction time; the item list is mutable.
 */
public class Feed {
 final String title;
 final String link;
 final String description;
 final String language;
 final String copyright;
 final String pubDate;
 final List<FeedItem> entries = new ArrayList<>();

 /**
  * Creates a feed with the given header values and an empty item list.
  *
  * @param title       feed title
  * @param link        feed link
  * @param description feed description
  * @param language    feed language
  * @param copyright   feed copyright notice
  * @param pubDate     feed publication date, kept as the raw string
  */
 public Feed(String title, String link, String description, String language,
   String copyright, String pubDate) {
  this.title = title;
  this.link = link;
  this.description = description;
  this.language = language;
  this.copyright = copyright;
  this.pubDate = pubDate;
 }

 /** @return the feed title */
 public String getTitle() {
  return this.title;
 }

 /** @return the feed link */
 public String getLink() {
  return this.link;
 }

 /** @return the feed description */
 public String getDescription() {
  return this.description;
 }

 /** @return the feed language */
 public String getLanguage() {
  return this.language;
 }

 /** @return the feed copyright notice */
 public String getCopyright() {
  return this.copyright;
 }

 /** @return the feed publication date */
 public String getPubDate() {
  return this.pubDate;
 }

 /**
  * @return the live, mutable list of items; callers add parsed items to it
  */
 public List<FeedItem> getMessages() {
  return this.entries;
 }

 /**
  * Renders the header fields on one line; the items are not included.
  */
 @Override
 public String toString() {
  StringBuilder text = new StringBuilder("Feed [");
  text.append("copyright=").append(copyright);
  text.append(", description=").append(description);
  text.append(", language=").append(language);
  text.append(", link=").append(link);
  text.append(", pubDate=").append(pubDate);
  text.append(", title=").append(title);
  text.append("]");
  return text.toString();
 }
}

RSSFeedParser class

package jbohn.xml.rss.read;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.XMLEvent;

import jbohn.xml.rss.model.Feed;
import jbohn.xml.rss.model.FeedItem;

/**
 * Reads an RSS document from a URL with the StAX event API and maps it onto a
 * {@link Feed} that holds one {@link FeedItem} per {@code <item>} element.
 */
public class RSSFeedParser 
{
 static final String TITLE = "title";
 static final String DESCRIPTION = "description";
 static final String CHANNEL = "channel";
 static final String LANGUAGE = "language";
 static final String COPYRIGHT = "copyright";
 static final String LINK = "link";
 static final String AUTHOR = "author";
 static final String ITEM = "item";
 static final String PUB_DATE = "pubDate";
 static final String GUID = "guid";
 
 final URL url;
 
 /**
  * @param feedURL URL of the RSS document to read
  * @throws RuntimeException wrapping the {@link MalformedURLException} when
  *         {@code feedURL} is not a valid URL
  */
 public RSSFeedParser(String feedURL)
 {
  try
  {
   this.url = new URL(feedURL);
  }
  catch(MalformedURLException e)
  {
   throw new RuntimeException(e);
  }
 }
 
 /**
  * Opens the feed URL.
  *
  * @return an open stream; the caller is responsible for closing it
  * @throws RuntimeException wrapping the {@link IOException} when the URL cannot be opened
  */
 private InputStream read()
 {
  try{
   return url.openStream();
  }
  catch(IOException e)
  {
   throw new RuntimeException(e);
  }
 }
 
 /**
  * Creates an input factory with DTD processing and external entities
  * switched off, because the document comes from the network (XXE hardening).
  */
 private XMLInputFactory newInputFactory()
 {
  XMLInputFactory inputFactory = XMLInputFactory.newInstance();
  inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
  inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
  return inputFactory;
 }
 
 /**
  * Collects the text that directly follows the start element just read.
  * <p>
  * A StAX parser that is not coalescing may deliver the text of one element
  * as several Characters events (for instance around an entity reference such
  * as {@code &amp;}), so all consecutive Characters events are concatenated.
  * Events that are not character data are left in the reader.
  *
  * @param eventReader reader positioned right after a start element
  * @return the concatenated text, or the empty string when no text follows
  * @throws XMLStreamException if the underlying document cannot be read
  */
 private String getCharacterData(XMLEventReader eventReader)
   throws XMLStreamException {
  StringBuilder text = new StringBuilder();
  while (eventReader.hasNext() && eventReader.peek().isCharacters())
  {
   text.append(eventReader.nextEvent().asCharacters().getData());
  }
  return text.toString();
 }
 
 /**
  * Tells whether an element namespace is the one the feed itself uses.
  * Before the channel element has been seen every element is accepted.
  */
 private static boolean belongsToFeed(String feedNamespace, String namespace)
 {
  return feedNamespace == null || feedNamespace.equals(namespace);
 }
 
 /**
  * Downloads and parses the feed.
  *
  * @return the parsed feed, never {@code null}; its item list is empty when
  *         the document contains no {@code <item>} element
  * @throws RuntimeException wrapping the {@link IOException} or
  *         {@link XMLStreamException} when the document cannot be read or parsed
  */
 public Feed readFeed()
 {
  // try-with-resources closes the network stream even when parsing fails.
  try (InputStream in = read())
  {
   XMLEventReader eventReader = newInputFactory().createXMLEventReader(in);
   try
   {
    return parse(eventReader);
   }
   finally
   {
    // XMLEventReader is not AutoCloseable, so it is closed by hand.
    eventReader.close();
   }
  }
  catch (XMLStreamException | IOException e)
  {
   throw new RuntimeException(e);
  }
 }
 
 /**
  * Walks the event stream and builds the feed.
  * <p>
  * Values read before the first {@code <item>} form the feed header. Values
  * read between an {@code <item>} start tag and its end tag form one item.
  * Elements from a namespace other than the one of the channel element
  * (e.g. {@code atom:link}, {@code media:title}) are ignored so that they
  * cannot overwrite the RSS values that share their local name.
  *
  * @param eventReader reader over the RSS document
  * @return the feed built from the document, never {@code null}
  * @throws XMLStreamException if the document is not well-formed
  */
 private Feed parse(XMLEventReader eventReader) throws XMLStreamException
 {
  Feed feed = null;
  // Namespace URI of the channel element; null until that element is seen.
  String feedNamespace = null;
  // Header values first, then reused for the values of the current item.
  String description = "";
  String title = "";
  String link = "";
  String language = "";
  String copyright = "";
  String author = "";
  String pubdate = "";
  String guid = "";
  
  while(eventReader.hasNext())
  {
   XMLEvent event = eventReader.nextEvent();
   if (event.isStartElement())
   {
    String namespace = event.asStartElement().getName().getNamespaceURI();
    String localPart = event.asStartElement().getName().getLocalPart();
    if (feedNamespace == null && CHANNEL.equals(localPart))
    {
     feedNamespace = namespace;
    }
    if (!belongsToFeed(feedNamespace, namespace))
    {
     continue;
    }
    switch (localPart) {
    case ITEM:
     if (feed == null)
     {
      // First item: everything collected so far is the feed header.
      feed = new Feed(title, link, description, language, copyright, pubdate);
     }
     // Start every item from a clean slate so that it does not inherit
     // values from the header or from the previous item.
     title = "";
     description = "";
     link = "";
     guid = "";
     author = "";
     break;
    case TITLE:
     title = getCharacterData(eventReader);
     break;
    case DESCRIPTION:
     description = getCharacterData(eventReader);
     break;
    case LINK:
     link = getCharacterData(eventReader);
     break;
    case GUID:
     guid = getCharacterData(eventReader);
     break;
    case LANGUAGE:
     language = getCharacterData(eventReader);
     break;
    case AUTHOR:
     author = getCharacterData(eventReader);
     break;
    case PUB_DATE:
     pubdate = getCharacterData(eventReader);
     break;
    case COPYRIGHT:
     copyright = getCharacterData(eventReader);
     break;
    default:
     break;
    }
   }
   else if (event.isEndElement())
   {
    String namespace = event.asEndElement().getName().getNamespaceURI();
    String localPart = event.asEndElement().getName().getLocalPart();
    if (feed != null && ITEM.equals(localPart) && belongsToFeed(feedNamespace, namespace))
    {
     FeedItem message = new FeedItem();
     message.setAuthor(author);
     message.setDescription(description);
     message.setGuid(guid);
     message.setLink(link);
     message.setTitle(title);
     feed.getMessages().add(message);
    }
   }
  }
  
  if (feed == null)
  {
   // No item in the document: still return the header instead of null.
   feed = new Feed(title, link, description, language, copyright, pubdate);
  }
  return feed;
 }
}


Running:
package jbohn.xml.rss.main;

import jbohn.xml.rss.model.Feed;
import jbohn.xml.rss.model.FeedItem;
import jbohn.xml.rss.read.RSSFeedParser;

/**
 * Command-line entry point: reads an RSS feed and prints the feed header
 * followed by every item, one per line.
 */
public class RSSMain {
 /** Feed that is read when no URL is passed on the command line. */
 static final String DEFAULT_FEED_URL = "http://feeds.bbci.co.uk/news/world/asia/rss.xml";

 /**
  * @param args optional; {@code args[0]}, when present, is the URL of the
  *             feed to read, otherwise {@link #DEFAULT_FEED_URL} is used
  */
 public static void main(String[] args) {
  String feedUrl = args.length > 0 ? args[0] : DEFAULT_FEED_URL;
  RSSFeedParser parser = new RSSFeedParser(feedUrl);
  Feed feed = parser.readFeed();
  if (feed == null) {
   // Guard against a null feed so the loop below cannot throw a
   // NullPointerException.
   System.err.println("No feed could be read from " + feedUrl);
   return;
  }
  System.out.println(feed);
  for (FeedItem message : feed.getMessages()) {
   System.out.println(message);
  }
 }
}



Summarized by jbohn.
Reference: http://www.vogella.com/articles/RSSFeed/article.html#rssoverview

1 comment:

  1. Does this make the difference between rss and atom?

    To me it looks like it would read
    [link]http://www.bbc.co.uk/news/world/asia/#sa-ns_mchannel=rss&ns_source=PublicRSS20-sa[/link]

    and then this link would be overridden by
    [atom:link href="http://feeds.bbci.co.uk/news/world/asia/rss.xml" rel="self" type="application/rss+xml"/]

    How do you handle that?

    ReplyDelete