#!/bin/sh
#
# unhypermail - remove extra hypermail stuff that clogs up the index
#
# Usage: unhypermail temp-file text/html URL config
#  (only $1 actually used in this script)
#
# Actually called as external converter script from htdig...
# external_parsers:	text/html->text/html-internal /path/to/unhypermail
#
# For other mail archive indexing examples, see 
#  http://www.htdig.org/files/contrib/scripts/README.geoupdate-ungeoify
#  http://www.htdig.org/files/contrib/scripts/geoupdate.sh
#  http://www.htdig.org/files/contrib/scripts/ungeoify.sh

# Filter one HTML file for htdig: hypermail pages are rewritten so that only
# the message text gets indexed; any other HTML file is passed through as-is.
#
# Arguments: $1 - path of the HTML file to convert (the remaining arguments
#                 listed in the usage line are accepted but not used)
# Outputs:   the converted HTML on stdout, diagnostics on stderr
# Returns:   2 if no file was given, otherwise the status of the last
#            sed/cat command that was run
unhypermail_filter() {
  # Without a file operand grep/sed/cat would silently read stdin instead.
  if [ "$#" -lt 1 ] || [ -z "$1" ]; then
    printf '%s\n' 'Usage: unhypermail temp-file text/html URL config' >&2
    return 2
  fi

  # All patterns below are plain ASCII. LC_ALL=C makes the tools match
  # byte-wise, so `.` also matches bytes that are not valid characters in the
  # caller's locale (e.g. Latin-1 subjects under a UTF-8 locale).

  # select set of modifications to HTML based on footer in message...
  if LC_ALL=C grep -q -e 'This archive was generated by .*hypermail ' -- "$1"; then

    # hypermail archives... use received date of message as mod time
    # (this requires use of use_doc_date in htdig.conf).
    # The first sed turns the received comment ("Day Mon DD HH:MM:SS YYYY")
    # into a date meta tag ("YYYY Mon DD HH:MM:SS"); the second one replaces
    # the month name by its two-digit number.
    LC_ALL=C sed -n -e 's|^<!-- received="... \(...\)  *\([0-9]*\) \(..:..:..\) \(....\).*|<meta name="date" content="\4 \1 \2 \3">|p' -- "$1" \
      | LC_ALL=C sed -e 's/Jan/01/; s/Feb/02/; s/Mar/03/; s/Apr/04/; s/May/05/; s/Jun/06/; s/Jul/07/; s/Aug/08/; s/Sep/09/; s/Oct/10/; s/Nov/11/; s/Dec/12/'

    # don't index message listings themselves (no received date in these),
    # but still let the indexer follow their links.
    if ! LC_ALL=C grep -q -e '^<!-- received="' -- "$1"; then
      printf '%s\n' '<meta name="robots" content="noindex,follow">'
    fi

    # follow links to other messages but don't index the text in&around links,
    # use full subject line for title, instead of truncated one.
    LC_ALL=C sed \
      -e 's|Messages sorted by:|<noindex follow>|' \
      -e 's|<!-- body="start" -->|</noindex>|' \
      -e 's|<!-- body="end" -->|<noindex follow>|' \
      -e '/^<TITLE>/d' \
      -e 's|^<META NAME="Subject" CONTENT="\(.*\)">|<title>\1</title>|' \
      -- "$1"

  else

    # index any other HTML file as-is
    cat -- "$1"

  fi
}

# Must stay the last command: its status becomes the script's exit status.
unhypermail_filter "$@"
