#!/net/u/1/f/fdc/kermit/wermit +
#
# ksitemap - sitemap.xml file constructor
#
define version 1.03                     # Version number of this script
define testingdest ~/tmp/               # Directory for test results - edit to suit
#
# Builds a sitemap.xml file for a website, with Google image extensions.
# Requires: C-Kermit 9.0 Alpha.03 or later.
# This file should be stored with execute permission.
# Top line must give full pathname of C-Kermit 9.0 executable.
#
# Documentation: http://www.kermit.edu/ksitemap.html
#
# Optional command-line argument: path of filelist file.  If the argument is
# given the web directory is assumed to be the same directory where the
# filelist is.  If not given, a file named "filelist" in the current directory
# is assumed if it exists.  For details see the documentation or read below.
#
# Data file summary... ON TOP:
# encoding=value -- The character set in which the filelist is encoded
# home=value -- URL of website home directory (required)
# geo=value  -- String: Default location for images (optional)
# lic=value  -- Filename of page containing copyright/license info (optional)
# .xxx=value -- Define a macro named xxx (optional)
#
# REST OF FILE: Information for each page to be included in sitemap:
# url=value[=value]  -- Filename of web page (required, second value optional)
# pri=value  -- Priority for indexing (0.0 to 1.0) (optional)
#
# AND FOR EACH IMAGE IN A URL THAT YOU WANT TO BE INDEXED:
# img=value  -- Filename of an image used in this web page (optional)
# cap=value  -- Caption for this image (optional)
# title=value - Title for this image (optional)
# (You can list as many images as you like for each URL, within reason)
#
# Version history:
# 1.00 Wed Dec  8 10:51:32 2010: Initial version
# 1.01 Sat Dec 11 19:02:20 2010: Fix bug in change frequency calculation
# 1.02 Tue Dec 28 15:18:15 2010: Allow for redirects
# 1.03 Thu Dec 30 09:51:46 2010: Add macro capability
#
# Author: Frank da Cruz, December 2010.
#
# Require C-Kermit 9.0.299 or later.  The test is numeric on purpose: a
# lexical comparison (IF LLT) would sort a 7-digit version number such as
# 1000400 before 900299 and wrongly reject a newer C-Kermit.
if < \v(version) 900299 exit 1 "C-Kermit 9.0 or later required"

if def \$(DEBUG) set debug message on   # DEBUG env variable requests debugging
.unix = 0                               # For "if unix ..."
if equ "\v(system)" "UNIX" .unix = 1
.usedenv = 0                            # Becomes 1 if KSITEMAPDIR is used

# ERREXIT text...
# Fatal error: prints a timestamp, the current directory and the given
# text (all arguments, \%*), then exits with status 1.
define errexit {
    echo \v(timestamp) \v(dir) sitemap.ksc:
    echo Error: \%*
    exit 1
}
# Work out the web directory and the name of the file-list file.
# Order of precedence:
#   1. Command-line argument: pathname of the file-list file
#   2. KSITEMAPDIR environment variable: directory holding "filelist"
#   3. Current directory, file named "filelist"
if def \%1 {                            # Command-line argument if any
    .webdirectory := \fdirname(\%1)     # is pathname of file list file.
    if def webdirectory {               # If it includes a directory part
        if not directory \m(webdirectory) { # Check it
            errexit NOT A DIRECTORY: \m(webdirectory)
        }
        cd \m(webdirectory)             # and CD to it
        if fail errexit CD FAILED: \m(webdirectory)
    }
    .filelist := \fbasename(\%1)        # And this is the name of the file
    if not def filelist .filelist = filelist
} else if def \$(KSITEMAPDIR) {         # Env variable KSITEMAPDIR exists
    .webdirectory := \$(KSITEMAPDIR)
    if not directory \m(webdirectory) { # Check it
        errexit "NOT A DIRECTORY: \m(webdirectory) [From $KSITEMAPDIR]"
    }
    cd \m(webdirectory)                 # and CD to it
    if fail errexit "CD \m(webdirectory) [From $KSITEMAPDIR]"
    .filelist = filelist                # And the file-list file is filelist
    .usedenv = 1
} else {                                # Otherwise
    .webdirectory := \v(dir)            # assume the current directory
    .filelist = filelist                # And default the filename to filelist
}
# The directory name is later prefixed directly to "sitemap.xml", so in Unix
# it must end with a slash (KSITEMAPDIR may have been given without one).
if def webdirectory {
    if unix if neq "\fright(\m(webdirectory),1)" "/" .webdirectory := \m(webdirectory)/
}
.resultdirectory := \m(webdirectory)    # Where to put sitemap.xml

# When debugging, redirect the result to the testing directory and show
# the parameters in effect.
if debug {                              # Debugging
    .resultdirectory := \m(testingdest)
    echo DEBUGGING...
    echo \fbasename(\%0) V\m(version)
    if usedenv echo Parameters obtained from $KSITEMAPDIR environment variable:
    show mac webdirectory filelist
    # Report the file that will really be written (not the file-list name)
    echo Writing result to \m(resultdirectory)sitemap.xml
    echo current directory is \v(dir)
}
if not exist \m(filelist) {             # Check that the file list file exists
    errexit FILE LIST NOT FOUND: \m(webdirectory)\m(filelist)
}
# Define some macros...

# FERREXIT "message"
# Fatal error while reading the file list: exits with status 1, showing
# the line number (left-padded with zeros to 3 places), the message (\%1)
# and the text of the offending line.
def FERREXIT {
    exit 1 [\flpad(\m(lineno),3,0)] \%1 [\m(line)]
}
# FERRWARN "message"
# Nonfatal counterpart of FERREXIT: prints the same line-number / message /
# line-text report but lets processing continue.
def FERRWARN {
    echo [\flpad(\m(lineno),3,0)] \%1 [\m(line)]
}
# FINISHIMAGE
# Writes the closing part of the current <image:image> clause to the output
# channel \%o: the optional geo_location and license elements, then the end
# tag.  Does nothing when no image clause is open (inimg is 0).
def FINISHIMAGE {
    if not inimg end 0                  # No image in progress - nothing to do
    if def geo {                        # A location applies to this image
        .\%9 := <image:geo_location>\m(geo)</image:geo_location>
        fwrite /line \%o "    \%9"
    }
    if def lic {                        # A license page applies to this image
        .\%9 := <image:license>\m(home)\m(lic)</image:license>
        fwrite /line \%o "    \%9"
    }
    fwrite /line \%o "  </image:image>" # End tag of the image clause
    .inimg = 0                          # Image clause is now closed
}
# FINISHURL
# Writes the closing part of the current <url> section to channel \%o.
# If the URL had images, the last image clause is closed (the priority was
# already written ahead of the first image); otherwise the priority is
# written here.  Does nothing when no URL section is open (inurl is 0).
define FINISHURL {
    if inurl {                          # Only if a URL section is open
        if > imginurl 0 {               # This URL listed at least one image
            do finishimage              # so close the last image clause
        } else {                        # No images: priority not yet written
            fwrite /line \%o "  <priority>\m(priority)</priority>"
        }
        fwrite /line \%o </url>         # End tag of the URL section
        .inurl = 0                      # URL section is now closed
    }
}
# Begin execution...

# Open the input (file list) and output (temporary sitemap) channels.
fopen /read \%c \m(filelist)            # \%c = channel for the file list
if fail errexit "\v(lastcommand)"       # Quit if it could not be opened

fopen /write \%o sitemap.tmp            # \%o = channel for the new sitemap
if fail errexit "OPEN /WRITE FAILED"    # Quit if it could not be created

# XML prolog.  Only the first write is checked; the ones that follow are
# assumed to succeed if this one did.
fwrite /line \%o <?xml version="1.0" encoding="UTF-8"?> # XML declaration
if fail errexit "WRITE FAILED: sitemap.tmp" # Quit if output is not writable

fwrite /line \%o <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
fwrite /line \%o -
{ xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">}

# Initial values for the file-list parser.
define tags |url|pri|img|cap|title|home|geo|lic|encoding|  # Tags accepted
define lineno 0                         # Line number within the file list
define urls 0                           # Number of URLs written
define imgs 0                           # Number of images written
define imginurl 0                       # Number of images in the current URL
define inurl 0                          # 1 while a URL section is open
define inimg 0                          # 1 while an image clause is open
define doutf8 0                         # 1 if values must be converted to UTF-8
define encoding UTF-8                   # Assumed encoding of the file list
assign mjd \fmjd(today)                 # Today's date as Modified Julian Date
undef home globalgeo globallic          # No home, global geo or license yet

# Main loop: read the file list line by line.  Each line is tag=value
# (url lines may carry a second value).  Header tags (encoding, home, geo,
# lic) and macro definitions (.name=value) set parser state; url, pri, img,
# cap and title lines generate sitemap XML on channel \%o.
while true {                            # Loop to read and process file list
    fread /line /trim \%c line          # Read a line and trim trailing blanks
    if fail break                       # Fail = end of file = all done
    increment lineno                    # Count this line
    if not defined line continue        # If empty line read the next one
    .line := \fltrim(\m(line))          # Trim leading blanks
    if equ "\s(line[1:1])" "#" continue # If it's a comment line keep reading
    if equ "\v(version)" "900299" {     # If C-Kermit is 9.0.299
        if not lgt "\v(test)" "Alpha.09" { # Alpha.09 or earlier...
            # Work around "Quoting Hell" bug in CSV splitting
            if \findex(\\,\m(line)) .line := \freplace(\m(line),\\,\\\\)
        }
    }
    .\%9 := \fsplit(\m(line),&x,=,CSV)  # Split line on '='
    if < \%9 2 { ferrwarn "TAG WITH NO VALUE", continue }
    undef s1 s2 s3
    .s1 := \&x[1]                       # Tag
    .s2 := \fcontents(\&x[2])           # Value
    if def \&x[3] .s3 := \fcontents(\&x[3]) # Optional second value (redirect)
    if doutf8 {                         # Converting character set?
        if not equ "7BIT" "\fstringtype(\m(s2))" { # Need to convert this one?
            .s2 := \fcvtcset(\m(s2),\m(encoding),utf-8) # Convert to UTF-8
        }
    }
    # Macro definition: a tag that starts with "." and has a name after it.
    # The length tested is that of the tag's VALUE, \m(s1); \flen(s1) would
    # measure the literal string "s1" and so always pass.
    if ( > \flen(\m(s1)) 1 && equ "\s(s1[1:1])" "." ) {
        _asg \s(s1[2]) \m(s2)           # See Using C-Kermit p.457
        if debug {
            message MACRO DEFINITION
            show mac \s(s1[2])
        }
        continue
    }
    if not \findex(|\m(s1)|,\m(tags)) { # Preverify tag
        ferrwarn "UNKNOWN TAG '\m(s1)'- SKIPPING"
        continue
    }
    if match "\m(s2)" "*\\m(*)*" {      # Check for macro reference in s2
        .s2 := \frecurse(\m(s2))        # Replace macro with its expansion
        if debug { echo s2 MACRO EXPANSION, show mac s2 }
    }
    if match "\m(s3)" "*\\m(*)*" {      # Check for macro reference in s3
        .s3 := \frecurse(\m(s3))        # Replace macro with its expansion
        if debug { echo s3 MACRO EXPANSION, show mac s3 }
    }
    # Handle each kind of tag...

    if equ "\m(s1)" "encoding" {        # Encoding of filelist file
        .encoding := \m(s2)             # Save it here
        message Encoding=\m(encoding)
        if not equ "\m(s2)" "UTF-8" .doutf8 = 1 # Must convert to UTF-8
        continue
    }
    if equ "\m(s1)" "home" {            # Website home directory
        # In Unix supply trailing slash if necessary
        if unix if neq "\fright(\m(s2),1)" "/" .s2 := \m(s2)/
        .home := \m(s2)                 # to be used in building URLs
        continue
    }
    if equ "\m(s1)" "geo" {             # Image geographic location
        if == 0 urls {                  # If geo given at the head of filelist
            .globalgeo := \m(s2)        # make it the global default value
        } else {
            .geo := \m(s2)              # set the local value
        }
        continue
    }
    if equ "\m(s1)" "lic" {             # Website license page
        if == 0 urls {                  # Same as for geo
            .globallic := \m(s2)
        } else {
            .lic := \m(s2)
        }
        continue
    }
    if equ "\m(s1)" "url" {             # Web page URL
        if not def home errexit "URL BEFORE HOME DEFINED"
        if inurl do finishurl           # Finish previous URL if any
        .imginurl = 0                   # How many images in this URL
        .priority = 0.5                 # Default page priority (0.0-1.0)
        .name := \m(s2)                 # Filename of this web page
        .redirect := \m(s3)             # Name (if any) it is redirected to
        if exist \m(s3) {               # If redirect is indicated
            .redirect := \m(name)       # swap the names
            .name := \m(s3)
        }
        if not exist \m(name) { ferrwarn "NOT EXIST \m(name)", continue }
        if not readable \m(name) { ferrwarn "NOT READABLE \m(name)" }
        .inurl = 1                      # We are doing a URL now

        message \m(name)...             # List the name if debugging
        incr urls                       # Count the URL
        fwrite /line \%o <url>          # Start the XML URL section
        # Add URL of this file to sitemap...
        if eq "\m(name)" "index.html" { # Special for home page
            fwrite /line \%o "  <loc>\m(home)</loc>"
        } else if def redirect {        # Special for redirected names
            fwrite /line \%o "  <loc>\m(home)\m(redirect)</loc>"
        } else {                        # Normal case
            fwrite /line \%o "  <loc>\m(home)\m(name)</loc>"
        }
        .s := \fcvtd(\fdate(\m(name)),3) # Modification date of file
        .s := \s(s[1:4])-\s(s[5:2])-\s(s[7:2]) # Just the date is enough
        fwrite /line \%o "  <lastmod>\m(s)</lastmod>" # Add to sitemap
        .\%x := \fmjd(\fdate(\m(name))) # Modification date as MJD
        # Age in days, relative to today's MJD as computed once at startup
        # (macro "mjd"); there is no macro named "today".
        .\%y ::= \m(mjd) - \%x          # How many days ago
        .c = yearly                     # Default change frequency is yearly
        if < \%y 8 .c =  daily          # If modified in last 7 days daily
        else if < \%y 30 .c =  weekly   # or in last 30 days say weekly
        else if < \%y 100 .c =  monthly # or in last 100 days say monthly
        fwrite /line \%o "  <changefreq>\m(c)</changefreq>" # Add to sitemap
        continue
    }
    if equ "\m(s1)" "pri" {             # Page priority
        if not inurl ferrexit "PRIORITY NOT IN URL"
        if not float \m(s2) ferrexit "PRIORITY NOT NUMERIC"
        if ( > \m(s2) 1.0 || < \m(s2) 0.0 ) ferrexit "PRIORITY OUT OF RANGE"
        .priority := \m(s2)             # Save it for epilog (see finishurl)
        continue
    }
    if equ "\m(s1)" "img" {             # Image
        if not inurl ferrexit "img not in url"
        finishimage                     # Finish previous image if any
        .geo := \m(globalgeo)           # If a global one defined use it
        .lic := \m(globallic)           #  wherever a local one is not given.
        if not exist \m(s2) { ferrwarn "IMG NOT EXIST: \m(s2)", continue }
        if not readable \m(s2) { ferrwarn "IMG NOT READABLE: \m(s2)", continue }
        # The image is accepted.  Only now write the page priority, so that
        # a skipped image (which leaves imginurl at 0) cannot cause the
        # priority to be written a second time later on.
        if == 0 imginurl {              # First accepted image for this URL?
            fwrite /line \%o "  <priority>\m(priority)</priority>"
        }
        increment imgs                  # Count this image
        incr imginurl                   # Count image for this URL
        fwrite /line \%o "  <image:image>" # Start image clause
        fwrite /line \%o "    <image:loc>\m(home)\m(s2)</image:loc>" # Put URL
        .inimg = 1                      # We are doing an image now
        continue
    }
    if equ "\m(s1)" "cap" {             # Image caption
        if not inimg ferrexit "CAP WITH NO IMG"
        if def s2 {                     # If the caption is not empty add it
            fwrite /line \%o "    <image:caption>\m(s2)</image:caption>"
        }
        continue
    }
    if equ "\m(s1)" "title" {           # Image title
        if not inimg ferrexit "TITLE WITH NO IMG"
        if def s2 {                     # If the title is not empty add it
            fwrite /line \%o "    <image:title>\m(s2)</image:title>"
        }
    }
}
# End of file list: complete the XML, close the files, rotate the previous
# sitemaps (sitemap.xml -> sitemap.ayer -> sitemap.ante) and install the
# new one.
do finishurl                            # End of file list - finish last URL
fwrite /line \%o </urlset>              # Finish the sitemap
fclose \%c                              # Done with the file-list file
fclose \%o                              # Close the temporary sitemap file
if fail errexit "CLOSE FAILED: sitemap.tmp" # Don't install an incomplete file
if exist sitemap.xml {                  # Rotate previous ones
    if exist sitemap.ayer {             # Check each COPY right after it runs
        copy /preserve sitemap.ayer sitemap.ante
        if fail message "FAILURE TO ROTATE OLD SITEMAP[1]"
    }
    copy /preserve sitemap.xml sitemap.ayer
    if fail message "FAILURE TO ROTATE OLD SITEMAP[2]"
}
rename sitemap.tmp \m(resultdirectory)sitemap.xml # Install the new sitemap
if fail errexit "FAILURE TO INSTALL NEW SITEMAP"
if unix {                               # Unix...
    chmod 644 \m(resultdirectory)sitemap.xml # Make it world readable
    if fail errexit "CHMOD FAILURE - \m(resultdirectory)sitemap.xml"
}
# When run in a cron job this message arrives in email
exit 0 "[\v(timestamp)] sitemap.ksc: OK - URLs: \m(urls); IMGs: \m(imgs)"

; Local Variables:
; comment-column:40
; comment-start:"# "
; End:
