Texis Web Script source code for:
https://texisdocs.thunderstone.com
Note: Click on links to view the documentation for HTML tags
which are special to the Vortex compiler.
<script language=vortex>
<!-- Script-wide time limit with an empty timeout block.
     NOTE(review): the value is presumably in seconds, which would put it just
     above the 30 second fetch timeout set in netsearch. Confirm. -->
<timeout=35></timeout>
<!-- Pull in the demonav module.
     NOTE(review): demolook, used by main, is not defined in this file and
     presumably comes from this module. Confirm. -->
<uses demonav=demonav>
<!----------------------------------------------------------------------------
Install Notes:
You must have either Texis 2.5+ or Webinator 2.5+ installed to run this.
Webinator may be downloaded from http://www.thunderstone.com/texis/site/pages/webinator.html
1: Save this file under the filename "meta" (no extension) in your
~htdocs/webinator directory and point your browser at
http://www.myserver.com/cgi-bin/texis/webinator/meta/
2: In the <main> function there's a tag that says <REMOVE_ME>. Remove it.
3: If a Search engine changes its display format or you want to add
a new engine, you'll need to edit the variables in the <init> function.
4: There's a helper app at http://www.thunderstone.com/texis/site/demos/metaparse/
that will aid in writing new parse expressions.
------------------------------------------------------------------------------>
<!----------- The main entry point of this script --------------------------->
<!-- main: the only function declared PUBLIC in this file.
     Inside the demolook wrapper (titled "Meta Search") it runs, in order:
     REMOVE_ME (install-time link bar; the install notes say to remove it),
     init (engine tables), searchform (query form), then netsearch, but only
     when a query $q was submitted, and finally a rule plus the caveat notice.
     NOTE(review): demolook is not defined in this file; presumably it is
     supplied by the demonav module. Confirm. -->
<a name=main PUBLIC>
<demolook title="Meta Search">
<REMOVE_ME>
<init>
<searchform>
<if $q neq "">
<netsearch>
</if>
<hr>
<caveat>
</demolook>
</a>
<!----------- Do the Metasearch and show results ---------------------------->
<!-- netsearch: substitutes the URL-escaped query $q into every search URL,
     fetches the engines selected in $searchhost in parallel, parses each
     returned page with that engine's TIMPORT schema and prints at most 10
     hits per engine.
     Reads: $q, $searchhost, the parallel lists $searchurl, $searchname,
     $bases and $imports, plus $removeme, $removemepost, $cookies, $debug.
     Side effects: appends one line per engine to logs/meta-queries.log under
     $SERVER_ROOT; for an engine with no hits it also appends to
     logs/meta-noanswers.log and writes the fetched page to
     /tmp/meta-noanswers.NAME.rawhtml and /tmp/meta-noanswers.NAME.html. -->
<a name=netsearch>
<strfmt "%U" $q> <!-- URL-escape query -->
<sandr "[\?#\{\}\+\\]" "\\\1" $ret> <!-- escape sandr replace chars -->
<sandr "xyzzy" $ret $searchurl> <!-- put the query in the URLs -->
<!-- NOTE(review): fetchthis is not read anywhere in this file; kept in case
     code outside this file uses it. -->
<$fetchthis=$ret>
<$liveurls = $ret>
<$acturls = >
<$actbases = >
<$actimports = >
<$acthosts = >
<loop $searchhost> <!-- build URL search list -->
<loop $searchname $liveurls $bases $imports>
<if $searchname eq $searchhost>
<$acturls = $acturls $liveurls>
<$actbases = $actbases $bases>
<$actimports = $actimports $imports>
<!-- Record the matched name from the configured list. The fetch loop below
     walks acturls, so its name column must have exactly one entry per URL;
     using the raw form list would drift out of step whenever the form posts
     a value that matches no engine, and would let that value reach the
     /tmp file names and the log lines. -->
<$acthosts = $acthosts $searchname>
</if>
</loop>
</loop>
<urlcp timeout 30> <!-- dont wait longer than N seconds for results -->
<urlcp maxpgsize 2000000><!-- default is 512KB -->
<if $cookies ne ''>
<!-- some servers require cookies to get predictable results -->
<urlcp cookiejar $cookies>
</if>
<!--
<urlcp useragent 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'>
-->
<urlcp useragent 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'>
<!-- Lycos crashes without Accept-Language header as of 2012-04-13 -->
<urlcp header "Accept-Language" "en-us,en;q=0.5">
<flush>
<fmtcp query "%mIH" $q>
<!-- One pass per fetched URL; actbases, acthosts and actimports are stepped
     in parallel so each pass sees its own engine's base, name and schema. -->
<fetch PARALLEL urls=$acturls $actbases $acthosts $actimports>
<$rawhtml=$ret>
<sandr $removeme "" $rawhtml>
<$html=$ret>
<hr width=550 align=left>
<big><b>Results from <a href="$acturls">$acthosts</a></b></big>
<dl></mm></sb>
<timport max=10 $actimports $html> <!-- Parse and print results -->
<local itemNext=$next>
<sandr $removemepost "" $Title><$Title=$ret>
<substr $Link 0 1>
<if $ret eq "/"> <!-- Prepend host if needed -->
<substr $Link 1 -1>
<strfmt "%s%s" $actbases $ret>
<$Link = $ret>
</if>
<rex "%3A" $Link />
<if $ret ne ""> <!-- decode url encoded link -->
<strfmt "%!U" $Link><$Link=$ret>
</if>
<!-- Links that start with neither http:// nor https:// get http:// put in
     front of them. -->
<substr $Link 0 7>
<if $ret ne "http://">
<substr $Link 0 8>
<if $ret ne "https://">
<strfmt "http://%s" $Link>
<$Link = $ret>
</if>
</if>
<p>
<mm>
<dt>$itemNext:
<fmt '<a href="%s">%s</a>' $Link $Title>
<!-- Abstract is lower-cased, has its tags removed, and is emitted with send -->
<dd><lower $Abstract><sandr '<=/?>>\alpha+[^>]*>' '' $ret><send $ret>
<dd><tt>$Link</tt>
</mm>
</timport>
</dl>
<flush>
<!-- Query log line: timestamp, client address, engine, hit count, query -->
<sum "%s" $SERVER_ROOT "/logs/meta-queries.log">
<write append $ret>
<fmt "%at " "%Y-%m-%d %H:%M:%S" now>$REMOTE_ADDR $acthosts $loop $q
</write>
<if $loop eq 0>
$acthosts returned no answers.<br><small><a href="$acturls">$acturls</a></small><p>
<if $saiddisclaimer eq "">
<$saiddisclaimer=y>
<i>Note: Search providers sometimes change their input forms
and/or search results formats. This code example is not monitored
as closely as a real application would be. If the above Url
gives answers it simply means that the search provider has
changed their format. This code example will eventually be updated
to handle the new format.</i>
</if>
<sum "%s" $SERVER_ROOT "/logs/meta-noanswers.log">
<write append $ret>
<fmt "%at " "%Y-%m-%d %H:%M:%S" now>$REMOTE_ADDR $acthosts $acturls
</write>
<!-- Keep both the raw page and the cleaned page for later inspection -->
<strfmt "/tmp/meta-noanswers.%s.rawhtml" $acthosts>
<write $ret><fmt "%s" $rawhtml></write>
<strfmt "/tmp/meta-noanswers.%s.html" $acthosts>
<write $ret><fmt "%s" $html></write>
<if $debug neq ""> <!-- Use this if you're adding a new Engine -->
<b>DEBUG:</b>
<pre>
$acthosts
$acturls
$actimports
<small>
$html
</small>
</pre>
</if>
</if>
<!-- NOTE(review): this closing sep tag has no visible opening sep in this
     function. Left unchanged; confirm it is intended. -->
</sep>
<flush>
</fetch>
</a>
<!-------------------- Warn about Copyright violations ----------------------->
<!-- caveat: prints a fixed small-print notice with a link to Thunderstone,
     stating that the software is provided for reference only and that
     following each content provider's usage policy is the user's
     responsibility. Takes no input and sets no variables. -->
<a name=caveat>
<small><i>Notice:</i>
<a href="https://www.thunderstone.com/">Thunderstone</a> has provided this
software for reference purposes only. It is the user's responsibility to ensure
conformance with the usage policies of the individual content providers.
</small>
</a>
<!--------------------- Display the search form ------------------------------>
<!-- searchform: prints the query form, posting to $url/main.html.
     When no engine has been chosen yet ($searchhost is empty) it first sets
     $searchhost to the whole $searchname list, so every engine is selected.
     Outputs a text box preloaded with $q, a submit button, and the engine
     checkboxes.
     NOTE(review): the checkbox arguments look like field name, values,
     selected values, labels. Confirm against the checkbox documentation.
     The form element wraps the table, rather than sitting between the table
     and its first row, so the generated HTML nests correctly. -->
<a name=searchform>
<if $searchhost eq "">
<$searchhost=$searchname>
</if>
<form method=post action="$url/main.html">
<table border=0 width=550 cellpadding=0 cellspacing=0>
<tr><td align=right>
<b>Search For:</b>
</td><td align=left>
<input name=q value="$q" size=45><input type=submit value="go">
</td></tr>
<tr bgcolor="#c0c0c0"><td align=right>
<b>On:</b>
</td><td align=left valign=top><small>
<checkbox "searchhost" $searchname $searchhost $searchname>
</small>
</td></tr>
</table>
</form>
</a>
<!-----------------------------------------------------------------------------
<Init> sets up the list of available engines and their affiliated parsers.
If the format of the result set changes for a engine, the TIMPORT
specification for that engine will need to be replaced.
------------------------------------------------------------------------------>
<!-- Variables set here and read by netsearch and searchform:
     removeme      patterns deleted from a fetched page before parsing
     removemepost  patterns deleted from each parsed title
     searchurl, searchname, bases, imports
                   parallel lists: entry N of each describes the same engine
     cookies       cookie jar setting, empty by default
     To add an engine, append one entry to each of the four parallel lists,
     keeping the same order in all of them. -->
<a name=init> <!-- This sets up all of the lists we need later on -->
<$removeme= "\x0d"
"<b>" ">><b =[^>]*>" "</b>"
"<em=[^>]*>" "</em>"
"<strong=[^>]*>" "</strong>"
"\?utm_content\=params%3A=[^\x22]*"
><!-- List of things to remove before parsing -->
<$removemepost= "<h3=[^>]*>" "</h3>"
"<span=[^>]*>" "</span>"
"<cite=[^>]*>" "</cite>"
><!-- List of things to remove from title after parsing -->
<!-- Could add per-site boilerplate removal to remove promoted results
as they are sometimes formatted the same as organic results
-->
<!-- Removed Yahoo since they use Bing data -->
<!-- 2023-09-13 - Remove Bing as it's mostly ads, few if any natural results -->
<!-- 2015-06-03 - Remove lycos as it requires a primer call to set keyvol -->
<!-- Search URL per engine; xyzzy is the placeholder that netsearch replaces
     with the escaped query. -->
<$searchurl =
"https://www.ask.com/web?q=xyzzy"
"https://www.google.com/search?q=xyzzy"
"http://search.thunderstone.com/texis/websearch15/?q=xyzzy&max=10&w3meta=1"
>
<!-- Display name per engine; also the value posted by the form checkboxes -->
<$searchname =
"Ask"
"Google"
"Thunderstone"
>
<!-- Prefix that netsearch puts in front of result links starting with "/".
     The Ask base uses https to match the scheme of its search URL above. -->
<$bases =
"https://www.ask.com/"
"https://www.google.com/"
"http://search.thunderstone.com/"
>
<!-- See TIMPORT DOCUMENTATION for more details on how the imports work -->
<!-- See REX DOCUMENTATION for more on our regular expression syntax -->
<!-- One TIMPORT schema per engine. Each one yields the Link, Title and
     Abstract fields that netsearch prints. -->
<$imports=
'#Ask
multiple
recexpr >>\{"abstract":"=!",*",=!,"title":+,"title":"=!",*","url":"=!",*",
# 1 2 3 4 5 6 7 8
# Name Type Tag
field Link varchar(40) 8
field Title varchar(80) 6
field Abstract varchar(180) 2
'
'#Google
multiple
recexpr >><div class\="yuRUbf"[\x20>]=!<a*<a =!href+href\=="?[^">]+"?[^>]*>=!<h3+<h3=[^>]*>=!</h3>+</h3>=!<div class\="VwiC3b+<div class\="VwiC3b=!<span>*<span>=!</span>+</span></div>
# 1 2 3 4 5 6 7 8 9 10 1112 13 14 15 16 17 18 19
# Name Type Tag
field Link varchar(40) 7
field Title varchar(80) 15
field Abstract varchar(180) 21
'
'#Thunderstone
multiple
recexpr >><dt>=[^<\x0a]+<a href\==[^>]+>=[^<\x0a]+</a><dd>=[^<\x0a]+<tt>=[^<\x0a]+</tt><br><i>=[^<\x0a]+</i><p>=
# 1 2 3 4 5 6 7 8 9 10 11 12 13
# Name Type Tag
field Link varchar(40) 4
field Title varchar(80) 6
field Abstract varchar(180) 8
'
> <!-- end of $imports -->
<!-- some servers require cookies to get predictable results -->
<$cookies=
>
</a>
<!-- End of the <init> function -->
<!-- REMOVE_ME: prints a one-row, full-width table holding the
     "Build a new Parser" link; a Download link is kept commented out.
     The install notes ask for the call to this function to be removed from
     main when the script is installed elsewhere.
     The cell holds a single small/tt pair: the original opened a second tt
     directly after the td and never closed it. -->
<a name=REMOVE_ME>
<table border=0 cellpadding=0 cellspacing=0 width="100%">
<tr>
<td nowrap>
<small><tt>
<!--
<a target=_top href="/texis/site/demos/download/main.txt?x=$SiteUrlPrefix$'pathroot'.vs">[Download]</a>
-->
<a target=_top href="/texis/site/demos/metaparse/">[Build a new Parser]</a>
</tt></small>
</td>
</tr>
</table>
</a>
</script>
The Source Viewer is also a Code Example.
Click Here to see its source.
Copyright © 1992-1999 Thunderstone Software