Texis Web Script source code for: /texis/site/demos/rsssearch.vs
Note: Click on links to view the documentation for HTML tags which are special to the Vortex compiler.
<SCRIPT LANGUAGE=vortex>

<uses demonav=demonav><!--simple look and feel module-->
<entryfunc=init><!-- init function for every invocation -->

<!-- ====================================================================== -->
<!-- Per-request setup.  This is the function named by the entryfunc directive,
     so it is run on every invocation of the script. -->
<a name=init private>
   <!-- the database lives in the texis directory under the server root -->
   <strfmt "%s/texis" $SERVER_ROOT>
   <db=$ret>
</a>

<!-- ====================================================================== -->
<!-- Primary starting point: prints the page (styles, instructions, form) and,
     when a feed URL was submitted, validates it and shows the results. -->
<a name=main>
   <demolook title="RSS Search"><!-- open the site demo boilerplate -->
      <mystyle>
      <h2>Example of RSS feed processing and real-time profiling</h2><p>
      <instructions>
      <p>
      <searchform>
      <if $feedurl neq ""><!-- a feed URL was submitted -->
         <!-- Only http and https URLs are handed to the fetcher.
              NOTE(review): the pattern is not visibly anchored to the start of
              the value, so text ahead of the http part may still pass; confirm
              against the REX syntax documentation. -->
         <rex ">>=http=s?://=[\alnum\-.]{3,}" $feedurl />
         <if $ret neq ""><!-- pattern matched: process the feed -->
            <div class="Results">
            <h2>Results</h2><p>
            <flush><!-- send the page so far before the feed is processed -->
            <dorss feedurl=$feedurl query=$query>
            </div>
         <else><!-- no match: refuse and tell the user why -->
            <p><b>Please enter an http or https URL.</b></p>
         </if>
      </if>
   </demolook><!-- close the site demo boilerplate -->
</a>

<!-- ====================================================================== -->
<!-- Emit the stylesheet that is specific to this demo.  Static output only.
       Results : class of the div that main wraps around the result list
       Result  : class of the div showrssitem prints for each feed item
       Hit     : class of the span showhits puts around each query match
     Shorthand properties are used; they set the same four sides as the
     per-side longhand declarations would. -->
<a name=mystyle>
   <style type="text/css">
      .Results {
         border:  1px solid black;
         padding: 4px;
      }
      .Result {
         background-color: #ddddee;
         padding: 0 4px 2px;
      }
      .Hit {
         background: blue;
         color:      white;
      }
   </style>
</a>

<!-- ====================================================================== -->
<!-- Print the search form, pre-filled with the values of the current request.
     Reads the variables feedurl, media and query; sets values and displays
     (neither is declared local in this function). -->
<a name=searchform>
   <!-- url is not assigned anywhere in this file; presumably it is the
        Vortex-provided URL of the running script, so the form posts back to
        this same page (confirm) -->
   <form name=rssform method=POST action="$url">
      RSS or Atom Feed URL:<br>
      <input size=60 name="feedurl" value="$feedurl"><br>
      Examples:<br>
      <tt>  http://rss.news.yahoo.com/rss/topstories<br>
      </tt><br>
      Render: <select name=media>
      <!-- values are the submitted option values, displays the labels shown to
           the user, in the same order -->
      <$values="html" "text">
      <$displays="full HTML and images" "just plain text">
      <!-- options is not defined in this file; presumably it prints one option
           tag per value and marks the one equal to media as selected (confirm) -->
      <options $values $media $displays>
      </select><br>
      Search for: <input size=20 name="query" value="$query"> <i>Optional</i><br>
      <input type=submit value="Submit">
      <!-- Clear empties the two text fields in the browser only; the Render
           selection is left as it is -->
      <input type=button value="Clear" onclick="document.rssform.feedurl.value=''; document.rssform.query.value=''">
   </form>
</a>

<!-- ====================================================================== -->
<!-- Print the static usage text shown above the form.  Takes no parameters
     and reads no variables. -->
<a name=instructions>
   Enter the URL for an
   <a href="http://en.wikipedia.org/wiki/RSS_(file_format)">RSS</a>
   or
   <a href="http://en.wikipedia.org/wiki/Atom_%28standard%29">Atom</a>
   feed and an optional query. The feed will be queried and its items
   displayed.  If a query is entered, only items matching the query
   will be displayed.
   <p>
   In a larger <a href="/texis/site/tutorial/Profiling.html">profiling</a>
   application one would place queries into
   a table and index it for use with the
   <!-- the function name is written with character entities so that it is
        displayed as link text instead of being parsed as a tag -->
   <a href="/site/vortexman/profiler.html">&lt;profiler&gt;</a>
   vortex function.
   <p>
   Alternatively one might fetch data from the feed and place it into a
   database and index it for quick retrospective searches. Or a combination
   of the approaches may be used.
</a>

<!-- ====================================================================== -->
<!-- Fetch, parse and display one feed.
     Params: feedurl - URL of the feed to download
             query   - optional query; passed on to processrss -->
<a name=dorss feedurl query>
<local feedxml>
   <fetchrss feedurl=$feedurl>
   <$feedxml=$ret><!-- fetchrss returns an empty value after reporting a failure -->
   <if $feedxml neq "">
      <!-- show the header values fetchrss stored, when there are any -->
      <if $NewEtag neq "">Etag: $NewEtag<br></if>
      <if $NewLastMod neq "">Last Modified: $NewLastMod<br></if>
      <parserss feedurl=$feedurl data=$feedxml><!-- turn the xml into records -->
      <!-- NOTE(review): this reads loop as left behind by parserss, so nothing
           may be inserted between the call above and this test -->
      <if $loop gt 0>
         <processrss query=$query><!-- display, and filter when there is a query -->
      </if>
   </if>
</a>

<!-- ====================================================================== -->
<!-- Download the feed document.
     Params:  feedurl - URL of the feed to fetch
     Returns: the fetched text with any DOCTYPE declaration removed, or an
              empty value after printing a message when the fetch reported an
              error or produced no data.
     Side effects: on success stores the ETag and Last-Modified response
              headers in NewEtag and NewLastMod (not local; dorss prints them). -->
<a name=fetchrss feedurl>
<local rawrssdata rssdata>
   <urlcp maxpgsize 4000000><!-- limit to 4MB -->
   <fetch urls=$feedurl /><!-- get the page -->
   <$rawrssdata=$ret><!-- keep the untouched response for the error report -->
   <!-- Test the fetch status before looking at the data: a failed fetch can
        come back with no body at all, and testing for empty data first would
        replace the real error with the generic "Got nothing" message. -->
   <urlinfo errnum>
   <if $ret neq 0><!-- page had an error -->
      <div style="background-color: #eedddd;">
      Got error $ret (<urlinfo errmsg>$ret)<br>
      <hr>raw data:<p>$rawrssdata
      </div>
      <return "">
   </if>
   <!-- remove doctype declaration so it doesn't confuse timport xml parse -->
   <sandr "<\!DOCTYPE=[^>]+>=" "" $rawrssdata>
   <$rssdata=$ret>
   <if $rssdata eq ""><!-- fetch succeeded but there is nothing to parse -->
      <div style="background-color: #eeeedd;">
      Got nothing from $feedurl<br>
      </div>
      <return "">
   </if>
   <!-- remember a couple useful header values -->
   <urlinfo header ETag><$NewEtag=$ret>
   <urlinfo header Last-Modified><$NewLastMod=$ret>
   <return $rssdata><!-- return the xml for processing -->
</a>

<!-- ====================================================================== -->
<!-- Parse the feed XML into records, trying each known feed layout in turn.
     Params: feedurl - feed URL; only passed through to parserssitem
             data    - the feed XML text
     parserssitem is called for every record a schema produces.  A notice is
     printed when no schema produces any record.  Nothing is returned.
     NOTE(review): dorss tests loop immediately after calling this function and
     the code below reads loop and ret right after each timport, so the
     statement order here must not be changed. -->
<a name=parserss feedurl data>
<local rssschema found prevputmsg>
   <!-- 3 timport schemas for the 3 major variants (2 RSS, 1 Atom).
        Try them in turn to see which works. When you get some records
        out you know you have the right one and can stop looking.
        The first two differ in their root path (rss/channel versus rdf:RDF);
        only the Atom schema defines the extra Content field. -->
   <$rssschema="
         # rss feed
         xml nohtml
         xmldatasetlevel 2
         field Title       varchar rss/channel/item/title ''
         field Description varchar rss/channel/item/description ''
         field Link        varchar rss/channel/item/link ''
         field PubDate     varchar rss/channel/item/pubDate ''
         field dcdate      varchar rss/channel/item/dc:date ''
         field Author      varchar rss/channel/item/author ''
      "
      "
         # rss feed
         xml nohtml
         xmldatasetlevel 1
         field Title       varchar rdf:RDF/item/title ''
         field Description varchar rdf:RDF/item/description ''
         field Link        varchar rdf:RDF/item/link ''
         field PubDate     varchar rdf:RDF/item/pubDate ''
         field dcdate      varchar rdf:RDF/item/dc:date ''
         field Author      varchar rdf:RDF/item/author ''
      "
      "
         # atom feed
         xml nohtml
         xmldatasetlevel 1
         field Title       varchar feed/entry/title ''
         field Description varchar feed/entry/summary ''
         field Link        varchar feed/entry/link@href ''
         field PubDate     varchar feed/entry/updated ''
         field dcdate      varchar feed/entry/published ''
         field Author      varchar feed/entry/author/name ''
         field Content     varchar feed/entry/content ''
      "
   >
   <vxcp putmsg log off><!-- prevent logging of xml errors from bad sources -->
   <$prevputmsg=$ret><!-- remember old setting for later restoration -->
   <$found=0><!-- did we find a parsable format? -->
   <loop $rssschema><!-- for each schema -->
      <timport $rssschema $data><!-- try it -->
         <parserssitem feedurl=$feedurl><!-- got a record, parse the content -->
      </timport>
      <!-- remember how many records this schema produced -->
      <$found=$loop>
      <if $loop eq 0><!-- try removing potential byte-order mark from xml -->
         <!-- the three bytes are the UTF-8 encoding of the byte-order mark -->
         <sandr ">>=\xEF\xBB\xBF" "" $data> 
         <timport $rssschema $ret><!-- try same schema again -->
            <parserssitem feedurl=$feedurl><!-- got a record, parse the content -->
         </timport>
         <$found=$loop>
      </if>
      <if $loop neq 0><!-- got some records from this schema -->
         <break><!-- stop looking for schema -->
      </if>
   </loop>
   <vxcp putmsg log $prevputmsg><!-- restore previous logging setting -->
   <if $found eq 0><!-- none of the schemas produced a record -->
      <div style="background-color: #eeeedd;">
      No RSS or Atom items found in feed.<br>
      </div>
   </if>
</a>

<!-- ====================================================================== -->
<!-- Normalize the record currently being imported; parserss calls this inside
     its timport loops.
     Params: feedurl - feed URL, given to fetch together with the item body
     Reads the record fields Description, Content, PubDate and dcdate plus the
     form variable media, and rewrites Description and PubDate. -->
<a name=parserssitem feedurl>
   <!-- body: Description wins, Content is the fallback (only the Atom schema
        defines Content) -->
   <if $Description eq "">
      <$Description=$Content>
   </if>
   <!-- run the body through fetch as a document of its own, with the feed URL
        as its address and urls rewritten to include the host -->
   <urlcp reparentmode abs>
   <fetch urls=$feedurl downloaddoc=$Description />
   <if $media eq "text"><!-- user asked for plain text instead of the HTML -->
      <urlinfo text>
   </if>
   <$Description=$ret>
   <!-- date: PubDate wins, dcdate is the fallback -->
   <if $PubDate neq "">
      <sandr "\." " " $PubDate><!-- change . to space -->
      <$PubDate=$ret>
   <else>
      <$PubDate=$dcdate>
   </if>
   <!-- values shaped like ....-..-..T..:..:.. keep only the part before the T
        and the time after it, joined by one space -->
   <sandr "....-..-..=T=..:..:..=.*" "\1 \3" $PubDate>
   <$PubDate=$ret>
</a>

<!-- ====================================================================== -->
<!-- Display the parsed records, leaving out those that do not match the query.
     Params: query - optional query; when empty every record is displayed
     Walks the record fields Title, Description, Link, PubDate and Author in
     parallel and prints a summary line afterwards.  Sets skipped, which is not
     declared local. -->
<a name=processrss query>
   <$skipped=0><!-- number of records not displayed -->
   <loop $Title $Description $Link $PubDate $Author><!-- one pass per record -->
      <if $query neq ""><!-- filtering is on -->
         <!-- the text fields are joined and searched as one value; a fancier
              application might use a separate query per field -->
         <sum "%s " $Title $Description $Author>
         <if $ret not like $query><!-- record does not match -->
            <$skipped=($skipped+1)>
            <continue><!-- next record; this one is not displayed -->
         </if>
      </if>
      <showrssitem><!-- print the current record -->
   </loop>
   $loop items processed.
   <if $query neq "">
   $skipped items suppressed due to non-match.
   </if>
   <p>
</a>

<!-- ====================================================================== -->
<!-- Print one feed record as a div of class Result.
     Takes no parameters; reads the current values of Link, Title, PubDate,
     Author and Description, and query for the hit highlighting. -->
<a name=showrssitem>
   <div class="Result">
   <!-- Link is split on newline (hex 0a), skipping empty pieces, and the title
        is printed as a link to the result.
        NOTE(review): max=1 presumably limits the result to a single piece, so
        probably only the first of several links is shown; confirm against the
        split documentation. -->
   <split nonempty max=1 '\x0a' $Link>
      <h3><a href="$ret"><showhits data=$Title query=$query></a></h3>
   </split>
   <!-- only show the meta fields if they have data -->
   <if "" neq $PubDate>Publication date: $PubDate<br></if>
   <if "" neq $Author>Author: <showhits data=$Author query=$query><br></if>
   <!-- html=y: the description is printed as markup, not escaped -->
   <showhits data=$Description query=$query html=y>
   <!-- empty clearing div, so the Result box extends below any floated content
        in the description -->
   <div style="clear: both"></div>
   </div>
</a>

<!-- ====================================================================== -->
<!-- Print text or HTML with the query matches wrapped in a span of class Hit.
     Params: data  - the text to print
             query - the query whose matches are highlighted
             html  - n (the default): data is plain text, so HTML tag chars are
                     escaped; any other value: tag chars are left alone -->
<a name=showhits data query html=n>
<local markupfmt>
   <$markupfmt="%mIhs"><!-- markup input: leave HTML tag chars alone -->
   <if $html eq "n">
      <$markupfmt="%mIhH"><!-- plain text: escape HTML tag chars so the user sees them -->
   </if>
   <strfmt $markupfmt $query $data><!-- mark up the hits -->
   <!-- swap the standard hit anchors for span elements that the stylesheet can
        target; the sixth subexpression is the text between the anchor tags -->
   <sandr ">><a name\=hit=\digit+ href\=#hit=\digit+>=!</a>+</a>"
          "<span class=Hit>\6</span>" $ret>
   <fmt "%!hV" $ret><!-- decode UTF-8, html encoding out-of-range chars -->
</a>

<!-- ====================================================================== -->
</script>

Back to the Code Example List

The Source Viewer is also a Code Example.
Click Here to see its source.
Copyright © 1992-1999 Thunderstone Software
Copyright © 2024 Thunderstone Software LLC. All rights reserved.