Texis Web Script source code for: https://texisdocs.thunderstone.com
Note: Click on links to view the documentation for HTML tags which are special to the Vortex compiler.
<script language=vortex>
<timeout=35></timeout>   <!-- script time limit of 35; the handler body is left empty. NOTE(review): presumably seconds, chosen to outlast the 30 set with urlcp timeout in netsearch; confirm against the Vortex docs -->

<uses demonav=demonav>

<!----------------------------------------------------------------------------
Install Notes: 

You must either have Texis 2.5+ or Webinator 2.5+ installed to run this.
Webinator may be downloaded from http://www.thunderstone.com/texis/site/pages/webinator.html
 
1: Save this file under the filename "meta" (no extension) in your
~htdocs/webinator directory and point your browser at
http://www.myserver.com/cgi-bin/texis/webinator/meta/

2: In the <main> function there's a tag that says <REMOVE_ME>. Remove that tag.

3: If a Search engine changes its display format or you want to add
a new engine, you'll need to edit the variables in the <init> function.

4: There's a helper app at  http://www.thunderstone.com/texis/site/demos/metaparse/
that will aid in writing new parse expressions.


------------------------------------------------------------------------------>

<!----------- The main entry point of this script  --------------------------->

<a name=main PUBLIC>
<!-- Page driver: inside the demolook wrapper it prints the demo link bar,
     loads the engine tables, draws the search form, and runs the metasearch
     only when a query string was supplied.  The notice is printed last. -->
<demolook title="Meta Search">
  <REMOVE_ME>
  <init>
  <searchform>
  <if $q ne ""><netsearch></if>
  <hr>
  <caveat>
</demolook>
</a>

<!----------- Do the Metasearch and show results  ---------------------------->

<a name=netsearch>
<!-- Sends the query in $q to every engine named in $searchhost, fetching the
     result pages in parallel and parsing each one with that engine's TIMPORT
     spec.  Reads $searchurl, $searchname, $bases, $imports, $removeme,
     $removemepost and $cookies, which the init function sets up.
     Side effects: appends to logs/meta-queries.log, and for an engine that
     yields no rows also appends to logs/meta-noanswers.log and dumps the
     fetched page under /tmp. -->
<strfmt "%U" $q>                          <!-- URL-escape query -->
<sandr "[\?#\{\}\+\\]" "\\\1" $ret>      <!-- escape sandr replace chars -->
<sandr "xyzzy" $ret $searchurl>           <!-- put the query in the URLs -->
<$fetchthis=$ret>
<$liveurls = $ret>
<$acthosts = >
<$acturls = >
<$actbases = >
<$actimports = >
<loop $searchhost>                        <!-- build URL search list -->
<loop $searchname $liveurls $bases $imports>
<if $searchname eq $searchhost>
<!-- Record the matched name from our own table alongside its URL.  The
     submitted $searchhost list may hold names that match nothing, which
     would leave it out of step with $acturls; it is also used below to
     build file names, so only known names may get that far. -->
<$acthosts = $acthosts $searchname>
<$acturls = $acturls $liveurls>
<$actbases = $actbases $bases>
<$actimports  = $actimports $imports>
</if>
</loop>
</loop>
<urlcp timeout 30>     <!-- don't wait longer than N seconds for results -->
<urlcp maxpgsize 2000000><!-- default is 512KB -->
<if $cookies ne ''>
    <!-- some servers require cookies to get predictable results -->
    <urlcp cookiejar $cookies>
</if>
<!--
<urlcp useragent 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'>
-->
<urlcp useragent 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'>
<!-- Lycos crashes without Accept-Language header as of 2012-04-13 -->
<urlcp header "Accept-Language" "en-us,en;q=0.5">
<flush>
<fmtcp query "%mIH" $q>
<!-- One pass of the body per fetched URL; the other lists are walked in
     step with $acturls, so every value below belongs to the same engine. -->
<fetch PARALLEL urls=$acturls $actbases $acthosts $actimports>
<$rawhtml=$ret>
<sandr $removeme "" $rawhtml>             <!-- strip markup that confuses the parsers -->
<$html=$ret>
<hr width=550 align=left>
<big><b>Results from <a href="$acturls">$acthosts</a></b></big>
    <dl></mm></sb>
    <timport max=10 $actimports $html>     <!-- Parse and print results -->
          <local itemNext=$next>
          <sandr $removemepost "" $Title><$Title=$ret>
          <substr $Link 0 1>
          <if $ret eq "/">                     <!-- Prepend host if needed -->
            <substr $Link 1 -1>
            <strfmt "%s%s" $actbases $ret>
            <$Link = $ret>
          </if>
          <rex "%3A" $Link />
          <if $ret ne "">                      <!-- decode url encoded link -->
             <strfmt "%!U" $Link><$Link=$ret>
          </if>
          <!-- Links with neither http:// nor https:// in front get http:// -->
          <substr $Link 0 7>
          <if $ret ne "http://">
             <substr $Link 0 8>
             <if $ret ne "https://">
                <strfmt "http://%s" $Link>
                <$Link = $ret>
             </if>
          </if>
          <p>
          <mm>
          <dt>$itemNext:
          <fmt '<a href="%s">%s</a>' $Link $Title>
             <dd><lower $Abstract><sandr '<=/?>>\alpha+[^>]*>' '' $ret><send $ret>
             <dd><tt>$Link</tt>
          </mm>
    </timport>
    </dl>
    <flush>
    <!-- $loop is tested below against 0 to detect an engine with no rows -->
    <sum "%s" $SERVER_ROOT "/logs/meta-queries.log">
    <write append $ret>
       <fmt "%at " "%Y-%m-%d %H:%M:%S" now>$REMOTE_ADDR $acthosts $loop $q
    </write>
    <if $loop eq 0>
       $acthosts returned no answers.<br><small><a href="$acturls">$acturls</a></small><p>
       <if $saiddisclaimer eq "">              <!-- print the note only once per page -->
          <$saiddisclaimer=y>
          <i>Note: Search providers sometimes change their input forms
             and/or search results formats. This code example is not monitored
             as closely as a real application would be. If the above Url
             gives answers it simply means that the search provider has
             changed their format. This code example will eventually be updated
             to handle the new format.</i>
       </if>
       <sum "%s" $SERVER_ROOT "/logs/meta-noanswers.log">
       <write append $ret>
          <fmt "%at " "%Y-%m-%d %H:%M:%S" now>$REMOTE_ADDR $acthosts $acturls
       </write>
       <!-- Keep the raw and the cleaned page so the parser can be repaired.
            The file name uses the validated engine name only. -->
       <strfmt "/tmp/meta-noanswers.%s.rawhtml" $acthosts>
       <write $ret><fmt "%s" $rawhtml></write>
       <strfmt "/tmp/meta-noanswers.%s.html" $acthosts>
       <write $ret><fmt "%s" $html></write>
       <if $debug neq "">      <!-- Use this if you're adding a new Engine -->
	  <b>DEBUG:</b>
	  <pre>
	  $acthosts
	  $acturls
	  $actimports
	  <small>
	  $html
	  </small>
	  </pre>
       </if>
    </if>
    <!-- NOTE(review): no opening sep tag is visible in this function;
         confirm the closing tag below is intentional -->
    </sep>
    <flush>
  </fetch>
</a>

<!-------------------- Warn about Copyright violations ----------------------->

<a name=caveat>   <!-- Prints the fixed usage-policy notice; takes no input -->
   <small><i>Notice:</i>
   <a href="https://www.thunderstone.com/">Thunderstone</a> has provided this
   software for reference purposes only. It is the user's responsibility to ensure 
   conformance with the usage policies of the individual content providers.
   </small>
</a>

<!--------------------- Display the search form ------------------------------>

<a name=searchform>      <!-- Prints the query box and the engine checkboxes -->
   <if $searchhost eq "">
    <!-- no engine chosen: fall back to the full list of engine names -->
    <$searchhost=$searchname>
   </if>
   <!-- The form wraps the whole table.  A form tag placed between the table
        and its first row is invalid HTML and only works through browser
        error recovery. -->
   <form method="post" action="$url/main.html">
   <table border="0" width="550" cellpadding="0" cellspacing="0">
   <tr><td align="right">
     <b>Search For:</b>
   </td><td align="left">
     <input name="q" value="$q" size="45"><input type="submit" value="go">
   </td></tr>
   <tr bgcolor="#c0c0c0"><td align="right">
     <b>On:</b>
   </td><td align="left" valign="top"><small>
    <!-- form field "searchhost", built from $searchname and $searchhost.
         NOTE(review): presumably one box per engine name with the currently
         selected engines checked; confirm against the checkbox tag docs -->
    <checkbox "searchhost" $searchname $searchhost $searchname>
    </small>
   </td></tr>
   </table>
   </form>
</a>

<!-----------------------------------------------------------------------------
  <Init> sets up the list of available engines and their affiliated parsers.
  If the format of the result set changes for an engine, the TIMPORT 
  specification for that engine will need to be replaced.
------------------------------------------------------------------------------>


<a name=init>           <!-- This sets up all of the lists we need later on -->
  <!-- $searchurl, $searchname, $bases and $imports are parallel lists: the
       Nth value of each describes the same engine, so an engine must be
       added or removed at the same position in all four.  The text xyzzy
       in a search URL marks where the escaped query is substituted. -->

  <$removeme= "\x0d"
              "<b>" ">><b =[^>]*>" "</b>"
              "<em=[^>]*>" "</em>"
              "<strong=[^>]*>" "</strong>"
              "\?utm_content\=params%3A=[^\x22]*"
  ><!-- List of things to remove before parsing -->
  <$removemepost= "<h3=[^>]*>" "</h3>"
                  "<span=[^>]*>" "</span>"
                  "<cite=[^>]*>" "</cite>"
  ><!-- List of things to remove from title after parsing -->
  <!-- Could add per-site boilerplate removal to remove promoted results
       as they are sometimes formatted the same as organic results
  -->
  <!-- Removed Yahoo since they use Bing data -->
  <!-- 2023-09-13 - Remove Bing as it's mostly ads, few if any natural results -->
  <!-- 2015-06-03 - Remove lycos as it requires a primer call to set keyvol -->
  <$searchurl =
  "https://www.ask.com/web?q=xyzzy"
  "https://www.google.com/search?q=xyzzy"
  "http://search.thunderstone.com/texis/websearch15/?q=xyzzy&max=10&w3meta=1"
  >
   <$searchname =
  "Ask"
  "Google"
  "Thunderstone"
  >
  <!-- Prefix put in front of result links that start with a slash.  Each
       base uses the same scheme as the matching search URL above. -->
   <$bases =
  "https://www.ask.com/"
  "https://www.google.com/"
  "http://search.thunderstone.com/"
  >


  <!-- See TIMPORT DOCUMENTATION for more details on how the imports work -->
  <!-- See REX DOCUMENTATION for more on our regular expression syntax -->
  <!-- Each spec names the Link, Title and Abstract fields by the number of
       the sub-expression that holds them; the ruler line under each recexpr
       is a counting aid for those numbers. -->
<$imports=

'#Ask
multiple
recexpr >>\{"abstract":"=!",*",=!,"title":+,"title":"=!",*","url":"=!",*",
#                       1   2  3          4          5   6         7   8
#       Name            Type            Tag
field   Link            varchar(40)     8
field   Title           varchar(80)     6
field   Abstract        varchar(180)    2
'

'#Google
multiple
recexpr >><div class\="yuRUbf"[\x20>]=!<a*<a =!href+href\=="?[^">]+"?[^>]*>=!<h3+<h3=[^>]*>=!</h3>+</h3>=!<div class\="VwiC3b+<div class\="VwiC3b=!<span>*<span>=!</span>+</span></div>
#                                    1   2         3 4     5 6    7 8    9   10   1112     13    14                   15                   16      17     18       19
#       Name            Type            Tag
field   Link            varchar(40)     7
field   Title           varchar(80)     15
field   Abstract        varchar(180)    21
'

'#Thunderstone
multiple
recexpr >><dt>=[^<\x0a]+<a href\==[^>]+>=[^<\x0a]+</a><dd>=[^<\x0a]+<tt>=[^<\x0a]+</tt><br><i>=[^<\x0a]+</i><p>=
#             1        2         3    4 5        6        7        8    9       10           11       12      13
#       Name            Type            Tag
field   Link            varchar(40)     4
field   Title           varchar(80)     6
field   Abstract        varchar(180)    8
'
>    <!-- end of $imports -->

  <!-- some servers require cookies to get predictable results -->
  <!-- left empty here; netsearch sets a cookie jar only when this has a value -->
  <$cookies=
  >


</a> 
<!-- End of the <init> function -->

<a name=REMOVE_ME>      <!-- Demo-site link bar; the install notes ask for its call in main to be removed -->
<table border="0" cellpadding="0" cellspacing="0" width="100%">
   <tr>
     <!-- a single small/tt pair, opened and closed inside the cell -->
     <td nowrap>
         <small><tt>
<!--
         <a target=_top href="/texis/site/demos/download/main.txt?x=$SiteUrlPrefix$'pathroot'.vs">[Download]</a>
-->
         <a target="_top" href="/texis/site/demos/metaparse/">[Build a new Parser]</a>
         </tt></small>
     </td>
   </tr>
</table>
</a>
 

</script>

Back to the Code Example List

The Source Viewer is also a Code Example.
Click Here to see its source.
Copyright © 1992-1999 Thunderstone Software
Copyright © 2024 Thunderstone Software LLC. All rights reserved.