#!/bin/sh -e
#
# # DASHT-QUERY-HTML 1            2020-05-16                            2.4.0
#
# ## NAME
#
# dasht-query-html - searches [Dash] docsets and emits HTML table rows
#
# ## SYNOPSIS
#
# `dasht-query-html` [*PATTERN*] [*DOCSET*]...
#
# ### Examples
#
# `dasht-query-html`
#   Topics (A-Z) from each installed docset.
#
# `dasht-query-html` 'c - x'
#   Search for "c - x" in all installed docsets.
#
# `dasht-query-html` 'c - x' bash
#   Search for "c - x" only in the "bash" docset.
#
# `dasht-query-html` 'c - x' bash css
#   Search for "c - x" only in the "bash" and "css" docsets.
#
# ## DESCRIPTION
#
# Searches for *PATTERN* in all installed [Dash] docsets, optionally searching
# only in those whose names match *DOCSET*s, by calling dasht-query-line(1).
# The results are then printed, one per line, to stdout as HTML table rows.
# However, if no results were found, this program exits with a nonzero status.
#
# ### Searching
#
# Whitespace characters in *PATTERN* are treated as wildcards, whereas the
# SQL LIKE wildcard characters `%` and `_` are not: they are taken literally.
#
# Before searching, *PATTERN* is surrounded by whitespace wildcards so that it
# can match anywhere: beginning, middle, or end.  As a result, if *PATTERN* is
# undefined, it becomes a whitespace wildcard and thereby matches everything.
#
# ## ENVIRONMENT
#
# `DASHT_DOCSETS_DIR`
#   Defines the filesystem location where your [Dash] docsets are installed.
#   If undefined, its value is assumed to be `$XDG_DATA_HOME/dasht/docsets/`
#   or, if `XDG_DATA_HOME` is undefined, `$HOME/.local/share/dasht/docsets/`.
#
# ## EXIT STATUS
#
# 44
#   No results were found.
#
# ## SEE ALSO
#
# dasht-query-line(1), dasht-docsets(1), dasht(1), [Dash]
#
# [Dash]: https://kapeli.com/dash
#
# ## AUTHOR
#
# Written in 2016 by Suraj N. Kurapati <https://github.com/sunaku/dasht>
# Distributed under the terms of the ISC license (see the LICENSE file).

# The producer half of the pipeline runs in a subshell and therefore cannot
# set the exit status of this script directly; when dasht-query-line(1) exits
# with a nonzero status it signals the main shell ($$) instead, and this trap
# converts that signal into the documented exit status.
trap 'exit 44' USR1 # exit with a nonzero status when no results found

# PATTERN is handed to awk through the environment instead of `-v` because
# awk applies backslash-escape processing to `-v` assignments, which would
# silently rewrite any pattern that contains a backslash (such as "\n").
{ dasht-query-line "$@" || kill -s USR1 $$ ;} |
DASHT_QUERY_HTML_PATTERN="$1" awk '
  # Escapes XML "predefined entities" in the given string and returns it.
  # The ampersand is replaced first so that the entities produced by the
  # later substitutions are not themselves escaped a second time.
  # See http://www.w3.org/TR/REC-xml/#sec-predefined-ent
  function escape(xml) {
    gsub("&",    "\\&amp;",  xml)
    gsub("\"",   "\\&quot;", xml)
    gsub("\047", "\\&#39;",  xml) # https://www.w3.org/TR/xhtml1/#C_16
    gsub("<",    "\\&lt;",   xml)
    gsub(">",    "\\&gt;",   xml)
    return xml
  }

  # Inserts the given breaker at the end of capitalized words, runs of
  # lowercase text, numbers, and punctuation marks in the given string.
  # Characters in the extras string are considered to be part of words.
  # Returns the resulting string; the breaker is inserted verbatim.
  function wordbreak(string, breaker, extras) {
    gsub("[" extras "[:upper:]]+[" extras "[:lower:]]*|"\
         "[" extras "[:lower:]]+|"\
         "[" extras "[:digit:]]+|"\
         "[" extras "[:punct:]]+", "&" breaker, string)
    return string
  }

  # Converts plain text into HTML that is safe to embed in a table cell:
  # word-break points are first marked with VT bytes (which escape() leaves
  # untouched), then XML entities are escaped, and finally every VT marker
  # becomes a <wbr> tag.  Marking before escaping keeps <wbr> tags from
  # landing inside the entities that escape() generates.
  # Characters in the optional extras string are treated as parts of words.
  function breakable_html(text, extras) {
    text = escape(wordbreak(text, "\v", extras))
    gsub("\v", "<wbr>", text)
    return text
  }

  # Returns breakable_html() of the given docset name, remembering answers
  # in docset_html_cache to avoid repeating the same work for every result.
  function docset_html(docset) {
    if (!(docset in docset_html_cache)) {
      docset_html_cache[docset] = breakable_html(docset)
    }
    return docset_html_cache[docset]
  }

  # Transforms alphabetical characters into bracketed regular expressions
  # that match either lowercase or uppercase versions of those characters.
  # This basically emulates the IGNORECASE feature in a POSIX environment.
  # The names after the wide gap in the parameter list are local variables.
  function ignorecase(regex,    buf, pos, chr) {
    buf = ""
    while (pos = match(regex, "[[:alpha:]]")) {
      chr = substr(regex, pos, 1)
      buf = buf substr(regex, 1, pos - 1) "[" tolower(chr) toupper(chr) "]"
      regex = substr(regex, pos + 1) # continue after the letter just handled
    }
    return buf regex
  }

  # Turns the raw PATTERN into a case-insensitive regular expression that is
  # used below to highlight the matching portion of each result name.
  BEGIN {
    pattern = ENVIRON["DASHT_QUERY_HTML_PATTERN"]

    # escape normal regex(7) syntax, including the backslash itself so that
    # it is matched literally instead of escaping whatever follows it
    gsub("[\\\\^.[$()|*+?{]", "\\\\&", pattern)

    sub("^[[:space:]]+", "", pattern)         # strip leading whitespace
    sub("[[:space:]]+$", "", pattern)         # strip trailing whitespace
    gsub("[[:space:]]+", ".*", pattern)       # treat whitespace as wildcards
    pattern = ignorecase(pattern)             # emulate IGNORECASE=1 for POSIX
    if (pattern == "") pattern = "^."         # grouped by leading character
  }

  # Open the table as soon as the first line of input arrives, so that
  # nothing at all is printed when there is no input.
  NR == 1      { print "<table>" }

  # Remember the value of every "KEY = VALUE" line under its key; the rules
  # below then refine the value that was just stored for their own key.
  $2 == "="    { result[$1] = substr($0, index($0, $2) + length($2) + 1) }

  $1 == "from" { result["from"] = docset_html(result["from"]) }

  $1 == "name" {
    # mark search terms with STX and ETX bytes which are ignored by escape();
    # pattern is never empty here because BEGIN falls back to "^."
    gsub(pattern, "\002&\003", result["name"])

    # escape XML entities and insert <wbr> tags, treating the STX and ETX
    # markers as parts of words so that they do not create break points
    result["name"] = breakable_html(result["name"], "\002\003")

    # highlight search terms in search result using the STX and ETX markers
    gsub("\002", "<b><i><u>", result["name"])
    gsub("\003", "</u></i></b>", result["name"])
  }

  # The "url" line triggers the output of one table row built from the
  # values gathered so far (this assumes that "url" is the last line of
  # every result; confirm against the output of dasht-query-line).
  $1 == "url"  {
    # escaped in place so that the END rule can reuse the very same value
    # inside another double-quoted HTML attribute
    result["url"] = escape(result["url"])

    print \
    "<tr>"\
      "<td><a href=\"" result["url"] "\">" result["name"] "</a></td>"\
      "<td valign=\"bottom\" align=\"right\">" result["from"] "</td>"\
      "<td valign=\"bottom\">" escape(tolower(result["type"])) "</td>"\
    "</tr>"
  }

  END {
    if (NR > 0) {
      print "</table>"
      # exactly four input lines are taken to mean a single search result
      # (presumably one line each for its name, type, from and url values)
      if (NR == 4) {
        # there was only one search result, so automatically visit its url
        print "<meta http-equiv=\"refresh\" content=\"0;url=" result["url"] "\">"
      }
    }
  }
'
