hawk/t/bibtex-to-html.hawk

547 lines
20 KiB
Plaintext

# http://www.netlib.org/bibnet/tools/software/bibtex-to-html.awk
#
### ====================================================================
### @Awk-file{
### author = "Nelson H. F. Beebe",
### version = "1.02",
### date = "05 July 1997",
### time = "12:04:52 MDT",
### filename = "bibtex-to-html.awk",
### address = "Center for Scientific Computing
### Department of Mathematics
### University of Utah
### Salt Lake City, UT 84112
### USA",
### telephone = "+1 801 581 5254",
### FAX = "+1 801 581 4148",
### URL = "http://www.math.utah.edu/~beebe",
### checksum = "08699 482 2173 18348",
### email = "beebe@math.utah.edu (Internet)",
### codetable = "ISO/ASCII",
### keywords = "bibliography, BibTeX, HTML, World-Wide Web,
### WWW",
### supported = "yes",
### docstring = "This program converts BibTeX bibliographies
### to HTML, suitable for viewing on the
### World-Wide Web.
###
### The level of HTML produced is version 3.2,
### adopted 14-Jan-1997, and defined in the SGML
### document type definition (DTD) available at
###
### http://www.w3.org/MarkUp/Wilbur/HTML32.dtd
###
### and documented at
###
### http://www.w3.org/MarkUp/Wilbur/
### http://www.w3.org/TR/REC-html32.html
###
### HTML markup is added to provide hypertext
### links for:
###
### * all URLs in the BibTeX file, both in
### comments, and inside string values;
### * all bibliography entry crossref
### values;
### * all \cite{} references;
### * all @String{name = "value"} names.
###
### In addition, every BibTeX citation label in
### @Entry lines, and every @String name, will
### be marked as an HTML label, allowing
### hypertext links to each from elsewhere in
### the same HTML file, or from other HTML
### files. In particular, every bibliography
### entry can be directly referenced by
### hypertext links from anywhere on the
### Internet.
###
### Each such linkable-name will be displayed
### in bold text to draw attention to the fact
### that it can be directly referenced by a
### suitable URL. In principle, this should be
### an option that WWW browsers provide, but
### none that I have used currently do.
###
### Although no browsers to my knowledge yet
### provide the capability of partial
### downloading of HTML files, the possibility
### has been discussed for future versions of
### the HTTP protocol. Such support would make
### it possible to construct bibliographies in
### electronic documents as links to large
### bibliography database files, without the
### browser having to load the entire database,
### but just individual entries. Since these
### in turn can have URLs that point to other
### electronic sources of the publication, a
### reader could easily follow links from a
### publication to a bibliography and then to
### abstracts and to the complete original
### text. Some journals, such as the Digital
### Technical Journal (electronically accessible
### at http://www.digital.com:80/info/DTJ/home.html),
### already could offer this possibility.
###
### The Web browser user will see material that
### looks just like normal BibTeX entries,
### except that some portions may be
### highlighted to indicate hypertext links.
### However, window cut-and-paste actions will
### recover a BibTeX entry in a form suitable
### for pasting into another BibTeX file,
### without any need for further editing.
###
### This program assumes that the BibTeX
### bibliography is formatted in the style
### produced by bibclean, and that embedded
### URLs and "key = stringname" pairs are coded
### on a single line, so that simple pattern
### matching suffices to recognize text in need
### of additional HTML markup.
###
### Usage:
### nawk -f bibtex-to-html.awk \
### [-v PREFIX=prefix] [-v SUFFIX=suffix] \
### BibTeX-file(s)
###
### An input file with a filename of the form
### abc.xyz is output to a file named
### PREFIXabcSUFFIX. The default PREFIX is
### empty, and the default SUFFIX is ".html".
###
### If no file names are specified on the
### command line, then the PREFIX and SUFFIX
### settings are ignored, and input is read
### from stdin, and output is written to
### stdout, so that the program can be used in
### a UNIX pipeline.
###
### In the current version, no provision is
### made for splitting the output files into
### smaller pieces to speed network file
### transfer. While this would improve browser
### responsiveness over slow network
### connections, it would also significantly
### complicate hypertext link generation for
### this program, and seriously damage browser
### search capability within the bibliography
### file. Perhaps the solution will come in
### (a) browsers' adopting the netscape browser
### practice of displaying data as soon as
### enough to fill a screen is available, and
### (b) faster network connections.
###
### In the TUG bibliography collection at
### ftp://ftp.math.utah.edu/, bibliography
### file sizes range from 3K to 4700K, with an
### average of 370K. These are rather large,
### since typical WWW file sizes need to be
### about 16K or less for good responsiveness.
###
### The checksum field above contains a CRC-16
### checksum as the first value, followed by the
### equivalent of the standard UNIX wc (word
### count) utility output of lines, words, and
### characters. This is produced by Robert
### Solovay's checksum utility.",
### }
### ====================================================================
BEGIN \
{
    # One-time initialisation: program identity, per-run state, and the
    # regular expressions / offsets that do_line() hands to anchor().

    ######################################################################
    VERSION = "1.02 [05-Jul-1997]" # <-- NB: Change this with each update!
    ######################################################################
    PROGRAM = "bibtex-to-html"

    # Sentinel meaning "no input file has been seen yet".
    UNSET_FILENAME = "/dev/unset"
    LASTFILENAME = UNSET_FILENAME
    _last_input_filename = UNSET_FILENAME

    if (SUFFIX == "")
        SUFFIX = ".html"

    USER = ENVIRON["USER"]
    if (USER == "")
        USER = ENVIRON["LOGNAME"]
    if (USER == "")
        USER = "????"

    # Each of these commands is read exactly once, so close the pipe
    # immediately rather than holding it open for the whole run.
    "hostname" | getline HOSTNAME
    close("hostname")
    "date" | getline DATE
    close("date")

    # [01-Aug-2019] ypcat no longer available: replace by getent
    # ("ypcat passwd | grep '^" USER ":' | awk -F: '{print $5}'") | getline PERSONAL_NAME
    _passwd_cmd = "getent passwd " USER " | " ARGV[0] " -F: '{print $5}'"
    _passwd_cmd | getline PERSONAL_NAME
    close(_passwd_cmd)
    if (PERSONAL_NAME == "")
    {
        ##("grep '^" USER ":' /etc/passwd | awk -F: '{print $5}'") | getline PERSONAL_NAME
        _passwd_cmd = "grep '^" USER ":' /etc/passwd | " ARGV[0] " -F: '{print $5}'"
        _passwd_cmd | getline PERSONAL_NAME
        close(_passwd_cmd)
    }

    # NB: this pattern is written for the &#34; spelling of a protected
    # double quote character.
    CROSSREF_EQUALS_LABEL_PATTERN = "^[ \t]*crossref[ \t]*=[ \t]*&#34;"

    # Pattern to match a line like this:
    # %%% email = "beebe at math.utah.edu (Internet)",
    #
    # The pattern is applied to text that has already been through
    # protect_SGML_characters(), which rewrites " as &quot;.  The leading
    # context `= &quot;' is therefore 8 characters long, and that is the
    # number of characters anchor() must skip to reach the address.
    BIBTEX_EMAIL_PATTERN = "= &quot;[A-Za-z0-9-]+ at [A-Za-z0-9.-]+"
    BIBTEX_EMAIL_OFFSET = 8 # length of the leading context `= &quot;'
    BIBTEX_EMAIL_PREFIX = "mailto:"
    BIBTEX_EMAIL_SAVE_LABEL = 0

    ##CITE_PATTERN = "\\\\cite{[^}]+}"
    CITE_PATTERN = "\\\\cite\\{[^\\}]+}"
    CITE_OFFSET = 6
    CITE_PREFIX = ""
    CITE_SAVE_LABEL = 1

    EMAIL_PATTERN = "[A-Za-z0-9-]+@[A-Za-z0-9.-]+"
    EMAIL_OFFSET = 0
    EMAIL_PREFIX = "mailto:"
    EMAIL_SAVE_LABEL = 0

    # See Nelson H. F. Beebe, ``Bibliography prettyprinting
    # and syntax checking'', TUGboat 14(3), 222-222, October
    # (1993), and 14(4), 395--419, December (1993) for the
    # syntax of BibTeX names used here in ENTRY_PATTERN,
    # KEY_EQUALS_NAME_PATTERN and STRING_PATTERN.
    ##ENTRY_PATTERN = "^[ \t]*@[ \t]*[A-Za-z][A-Za-z0-9:.+/'-]*[ \t]*{[A-Za-z][A-Za-z0-9:.+/'-]*,[ \t]*$"
    ENTRY_PATTERN = "^[ \t]*@[ \t]*[A-Za-z][A-Za-z0-9:.+/'-]*[ \t]*\\{[A-Za-z][A-Za-z0-9:.+/'-]*,[ \t]*$"
    KEY_EQUALS_NAME_PATTERN = "^[ \t]*[A-Za-z][A-Za-z0-9:.+/'-]*[ \t]*=[ \t]*[A-Za-z]"
    ##STRING_PATTERN = "^@[Ss][Tt][Rr][Ii][Nn][gG]{[A-Za-z][A-Za-z0-9:.+/'-]*"
    STRING_PATTERN = "^@[Ss][Tt][Rr][Ii][Nn][gG]\\{[A-Za-z][A-Za-z0-9:.+/'-]*"
    STRING_OFFSET = 8
    STRING_PREFIX = ""
    STRING_SAVE_LABEL = 1

    # According to Internet RFC 1614 (May 1994), a URL is
    # defined in the document T. Berners-Lee, ``Uniform
    # Resource Locators'', March 1993, available at URL
    # ftp://info.cern.ch/pub/ietf/url4.ps. Unfortunately,
    # that address is no longer valid. However, I was able to
    # track down pointers from http://www.w3.org/ to locate a
    # suitable description in Internet RFC 1630 (June 1994).
    # NB: We additionally disallow & in a URL because it is
    # needed in SGML entities "&name;". We also disallow =
    # and | because these are commonly used in \path=...= and
    # \path|...| strings in BibTeX files. These restrictions
    # could be removed if we went to the trouble of first
    # encoding these special characters in %xy hexadecimal
    # format, but they are rare enough that I am not going to
    # do so for now. The worst that will happen from this
    # decision is that an occasional URL in a BibTeX file will
    # be missing a surrounding anchor.
    URL_PATTERN = "[A-Za-z]+://[^ \",&=|]+"
    URL_OFFSET = 0
    URL_PREFIX = ""
    URL_SAVE_LABEL = 0

    # [24-May-2016] support for background coloring of block comments
    IN_BLOCK_COMMENT = 0
}
# Pattern-less rule: every input record is handed to do_line(), which
# performs all of the per-line markup and output.
{
    do_line()
}
END {
    # Write the HTML trailer for the final input file.  Nothing is
    # written when no input file was ever started.
    if (LASTFILENAME != UNSET_FILENAME) {
        end_file(LASTFILENAME)
    }
}
function add_entry(array,value)
{
    # Record the current input line number (FNR) under array[value].
    # The first occurrence stores FNR alone; later occurrences append
    # " " FNR, giving a blank-separated list of line numbers.
    if (!(value in array))
    {
        array[value] = FNR
        return
    }
    array[value] = array[value] " " FNR
}
function anchor(s,type,pattern,offset,prefix,save_label,    name,rstart,rlength)
{
    # Add anchors <A type="....">...</A> around every piece of text in s
    # that matches pattern, and return the marked-up string.
    #
    #   type        anchor attribute name ("HREF" or "NAME"); NAME
    #               anchors additionally get <STRONG>...</STRONG>
    #   pattern     regular expression locating the text
    #   offset      number of characters to discard from the start of
    #               the match, allowing the pattern to contain leading
    #               context which stays outside the anchored region
    #   prefix      text placed before the matched string inside the
    #               quoted attribute value
    #   save_label  when non-zero, the anchor value is recorded with
    #               add_entry() in label_hrefs (HREF) or label_names
    #               (NAME)
    #
    # The remainder of the line after each match is processed
    # recursively, with the same arguments, so that every match on the
    # line is treated alike.
    if (match(s,pattern))
    {
        # Private copies are needed because the recursive call below
        # runs match() again and so overwrites RSTART and RLENGTH.
        rstart = RSTART + offset        # skip the leading context
        rlength = RLENGTH - offset
        name = substr(s,rstart,rlength)
        sub(/ +at +/,"@",name)          # reduce "user at host" to "user@host"
        s = substr(s,1,rstart-1) \
            "<A " type "=\"" prefix name "\">" \
            ((type == "NAME") ? "<STRONG>" : "") \
            substr(s,rstart,rlength) \
            ((type == "NAME") ? "</STRONG>" : "") \
            "</A>" \
            anchor(substr(s,rstart+rlength),type,pattern,offset,prefix,save_label)
        if (save_label)
        {
            if (type == "HREF")
                add_entry(label_hrefs, name)
            else if (type == "NAME")
                add_entry(label_names, name)
        }
    }
    return (s)
}
function begin_file(    f,slash_pos)
{
    # Start the HTML output that corresponds to the current input file
    # (FILENAME): write the fixed HTML prologue up to and including the
    # opening <PRE>, and reset all per-file state.
    #
    # Sets the global BASE_FILENAME to FILENAME with any leading
    # directory part removed.
    f = output_filename(FILENAME)

    ## NB: If Transitional is eliminated in DOCTYPE, background coloring is lost! Why?
    slash_pos = str::rindex(FILENAME, "/");
    BASE_FILENAME = (slash_pos > 0)? str::substr(FILENAME, slash_pos + 1): FILENAME;

    print "<!-- -*-html-*- -->" > f
    print "" > f
    ## print "<!-- " FILENAME " -->" > f
    print "<!-- " BASE_FILENAME " -->" > f
    print "<!-- WARNING: Do NOT edit this file. It was converted from -->" > f
    print "<!-- BibTeX format to HTML by " PROGRAM " version " VERSION " -->" > f
    ## print "<!-- on " DATE " -->" > f
    ## print "<!-- for " PERSONAL_NAME " (" USER "@" HOSTNAME ") -->" > f
    print "" > f
    print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/1998/REC-html40-19980424/loose.dtd\">" > f
    print "" > f
    print "" > f
    print "<HTML>" > f
    print " <HEAD>" > f
    print " <META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=iso-8859-1\">" > f
    print " <TITLE>" > f
    ## print " BibTeX bibliography " FILENAME > f
    # The file name is ordinary text inside <TITLE>, so any &, <, > or "
    # in it must be written as an entity, just like the body text.
    print " BibTeX bibliography " protect_SGML_characters(BASE_FILENAME) > f
    print " </TITLE>" > f
    ## print " <LINK REV=\"made\" HREF=\"mailto:" USER "@" HOSTNAME "\">" > f
    print " <LINK HREF=\"http://www.math.utah.edu/pub/tex/bib/tugbib.css\" TYPE=\"text/css\" REL=\"stylesheet\">" > f
    print " </HEAD>" > f
    print "" > f
    print " <BODY>" > f
    print " <DIV ALIGN=\"right\">" > f
    print " <A HREF=\"http://validator.w3.org/check/referer\">" > f
    print " <IMG ALIGN=\"MIDDLE\" BORDER=\"0\" SRC=\"/images/valid-html40.png\" ALT=\"Valid HTML 4.0!\" HEIGHT=\"31\" WIDTH=\"88\">" > f
    print " </A>" > f
    print " <A HREF=\"http://jigsaw.w3.org/css-validator/check/referer\">" > f
    print " <IMG ALIGN=\"MIDDLE\" BORDER=\"0\" SRC=\"/images/valid-css.gif\" ALT=\"Valid CSS!\" HEIGHT=\"31\" WIDTH=\"88\">" > f
    print " </A>" > f
    print " </DIV>" > f
    print "<PRE>" > f

    # Per-file state.  The <PRE> just written is an ordinary one, so the
    # block-comment flag must start out clear; otherwise a file that
    # begins with a % line, read after a file that ended inside a block
    # comment, would never get its <PRE CLASS="blockcomment">.
    IN_BLOCK_COMMENT = 0
    clear_array(label_names)
    clear_array(label_hrefs)
}
function check_for_file_change(    out)
{
    # Detect the first record of a new input file.  When the input file
    # has changed, finish the output for the previous file (if there
    # was one), close it, and start the output for the new file.
    if (LASTFILENAME != FILENAME)
    {
        if (LASTFILENAME != UNSET_FILENAME)
        {
            end_file(LASTFILENAME)
            # Standard output must stay open for the rest of the run.
            # The test has to be made on the OUTPUT name: LASTFILENAME
            # is an input name, and input from stdin is what maps to
            # /dev/stdout.
            out = output_filename(LASTFILENAME)
            if (out != "/dev/stdout")
                close (out)
        }
        LASTFILENAME = FILENAME
        begin_file()
    }
}
function check_refs(    label)
{
    # Report every label that was referenced (recorded in label_hrefs)
    # but never defined (absent from label_names).  The value stored in
    # label_hrefs is the list of line numbers where it was referenced.
    for (label in label_hrefs)
    {
        if (label in label_names)
            continue
        warning("undefined label " label " at line(s) " label_hrefs[label])
    }
}
function clear_array(array,    idx)
{
    # Empty the array in place by deleting its elements one at a time.
    for (idx in array)
    {
        delete array[idx]
    }
}
function end_file(filename,    out)
{
    # Write the closing HTML for the output that belongs to the input
    # file `filename', then report any labels that were referenced in
    # that file but never defined.
    out = output_filename(filename)
    print "</PRE>" > out
    print " </BODY>" > out
    print "</HTML>" > out
    check_refs()
}
function do_cite(s,    k,n,labels,t,key,tail)
{
    # Turn the labels of every \cite{label1,label2,...} in s into
    # hypertext links to the corresponding entries, and return the
    # marked-up string.
    #
    # The caller must have just executed match(s,CITE_PATTERN), so that
    # RSTART and RLENGTH describe the first \cite{...} in s.  Later
    # \cite{...} occurrences on the same line are found here.
    #
    # Every label is recorded in label_hrefs.  The visible text is left
    # exactly as it was in the input; only the link target and the
    # recorded label have surrounding blanks removed, so that
    # "\cite{a, b}" refers to "b" and not to " b".
    n = split(substr(s,RSTART + CITE_OFFSET,RLENGTH - 1 - CITE_OFFSET),labels,",")
    t = substr(s,1,RSTART + CITE_OFFSET - 1)    # everything up to and including "{"
    tail = substr(s,RSTART + RLENGTH - 1)       # the closing "}" and what follows
    for (k = 1; k <= n; ++k)
    {
        key = labels[k]
        gsub(/^[ \t]+|[ \t]+$/,"",key)
        t = t ((k > 1) ? "," : "") "<A HREF=\"#" key "\">" labels[k] "</A>"
        add_entry(label_hrefs, key)
    }
    # The tail starts at the "}" of the \cite just handled, so a new
    # match can only be a later \cite on the line.
    if (match(tail,CITE_PATTERN))
        tail = do_cite(tail)
    return (t tail)
}
function do_line(    n,name,s)
{
    # Process one input line: protect SGML special characters, add
    # hypertext anchors for e-mail addresses, URLs, @String names,
    # \cite{} labels, entry labels, string-name uses and crossref
    # values, then write the result to the current output file.

    # The file change must be noticed BEFORE any label is recorded:
    # begin_file() clears label_names and label_hrefs, and end_file()
    # checks them for the previous file, so labels found on the first
    # line of a file must not be collected until that has happened.
    check_for_file_change()

    s = protect_SGML_characters($0)

    if (match(s,STRING_PATTERN)) # remember name from @String{name = "value"}
    {
        name = substr(s,RSTART + STRING_OFFSET,RLENGTH - STRING_OFFSET)
        string_name[name] = 1
        # print "DEBUG 1: name =", name >"/dev/stderr"
    }

    if (match(s,/^%+[ \t]*email[ \t]*=/)) # special handling because BibTeX does not allow @ in comments
        s = anchor(s,"HREF",BIBTEX_EMAIL_PATTERN,BIBTEX_EMAIL_OFFSET,BIBTEX_EMAIL_PREFIX,\
                   BIBTEX_EMAIL_SAVE_LABEL)
    else
        s = anchor(s,"HREF",EMAIL_PATTERN,EMAIL_OFFSET,EMAIL_PREFIX,EMAIL_SAVE_LABEL)
    s = anchor(s,"HREF",URL_PATTERN,URL_OFFSET,URL_PREFIX,URL_SAVE_LABEL)
    s = anchor(s,"NAME",STRING_PATTERN,STRING_OFFSET,STRING_PREFIX,STRING_SAVE_LABEL)

    if (match(s,CITE_PATTERN))
        s = do_cite(s)

    if (match(s,ENTRY_PATTERN)) # then have ``@Entry{label,''
    {
        n = index(s,"{")
        name = substr(s,n+1)
        gsub(/^[ \t]*/,"",name)     # trim optional leading space
        gsub(/,[ \t]*$/,"",name)    # trim trailing comma and optional space
        # print "DEBUG 2: name =", name >"/dev/stderr"
        s = substr(s,1,n) \
            "<A NAME=\"" name "\"><STRONG>" name "</STRONG></A>" \
            substr(s,n+1+length(name))
        add_entry(label_names, name)
    }
    else if (match(s,KEY_EQUALS_NAME_PATTERN)) # then have ``key = name''
    {
        name = substr(s,RSTART+RLENGTH-1)
        sub(/,?[ \t]*$/,"",name)    # trim optional trailing comma and space
        # print "DEBUG 3: name =", name >"/dev/stderr"
        if (name in string_name)    # then we have a definition of this name
        {
            s = substr(s,1,RSTART+RLENGTH-2) \
                "<A HREF=\"#" name "\">" name "</A>" substr(s,RSTART+RLENGTH-1+length(name))
            add_entry(label_hrefs, name)
        }
    }
    # protect_SGML_characters() writes the double quote as &quot;, while
    # CROSSREF_EQUALS_LABEL_PATTERN is written for the &#34; spelling.
    # Accept either spelling, both for the opening quote located by the
    # match and for the closing quote trimmed below; otherwise crossref
    # values are never linked.
    else if (match(s,CROSSREF_EQUALS_LABEL_PATTERN) || \
             match(s,/^[ \t]*crossref[ \t]*=[ \t]*&quot;/)) # then have `` crossref = "label"''
    {
        name = substr(s,RSTART+RLENGTH)
        sub(/&(quot|#34);,?[ \t]*$/,"",name) # trim trailing quote and optional comma and space
        # print "DEBUG 4: name =", name >"/dev/stderr"
        s = substr(s,1,RSTART+RLENGTH-1) \
            "<A HREF=\"#" name "\">" name "</A>" substr(s,RSTART+RLENGTH+length(name))
        add_entry(label_hrefs, name)
    }

    # Runs of lines starting with % are wrapped in their own
    # <PRE CLASS="blockcomment"> element so they can be styled.
    if ( (s ~ "^%") && !IN_BLOCK_COMMENT)
    {
        printf("</PRE><PRE CLASS=\"blockcomment\">") > output_filename(FILENAME)
        IN_BLOCK_COMMENT = 1
    }
    else if ( (s !~ "^%") && IN_BLOCK_COMMENT)
    {
        printf("</PRE><PRE>") > output_filename(FILENAME)
        IN_BLOCK_COMMENT = 0
    }
    print s >output_filename(FILENAME)
}
function output_filename(input_filename)
{
    # Return the name of the output file that corresponds to
    # input_filename.
    #
    # An input name of the form abc.xyz maps to PREFIX "abc" SUFFIX; a
    # name without an extension maps to PREFIX name SUFFIX.  An empty
    # name, or "/dev/stdin", maps to "/dev/stdout".  When the global
    # T_OUT_NAME is non-empty it is returned unconditionally.

    ## HAWK - for use in t/h-003.hawk
    if (length(T_OUT_NAME) > 0) return T_OUT_NAME;
    ## END HAWK

    if (input_filename != _last_input_filename)
    {   # optimization: we cache last function result for speed
        _last_input_filename = input_filename
        # Strip a trailing ".ext" only.  The dot must be a literal dot,
        # and the extension may not contain a slash: otherwise a name
        # with no extension is stripped down to nothing (and silently
        # sent to stdout), and a dot in a directory name is mistaken
        # for the start of the extension.
        sub(/\.[^.\/]*$/,"",input_filename)
        if ((input_filename == "") || (input_filename == "/dev/stdin"))
            _last_output_filename = "/dev/stdout"
        else
            _last_output_filename = PREFIX input_filename SUFFIX
    }
    return (_last_output_filename)
}
function protect_SGML_characters(s)
{
    # Return s with the four characters that are special in SGML/HTML
    # text replaced by entities: & < > and the double quote.

    # The ampersand MUST be handled first, because every replacement
    # below introduces ampersands of its own.
    gsub(/&/,"\\&amp;",s)

    ## [24-May-2016] with the change from HTML 3.2 to 4.0, we can use &quot; again!
    ## gsub(/\"/,"\\&#34;",s)
    # The double quote was &quot; in earlier HTML versions, including
    # the HTML 3.2 draft, but the entity was eliminated in the final
    # HTML 3.2 version: see http://www.w3.org/pub/WWW/MarkUp/Wilbur/
    # in the section ``What happened to &quot;?''
    gsub(/\"/,"\\&quot;",s)

    gsub(/>/,"\\&gt;",s)
    gsub(/</,"\\&lt;",s)
    return s
}
function warning(message)
{
    # Write a warning message, tagged with a file name, on stderr.
    #
    # print FILENAME ":" FNR ":%%" message >"/dev/stderr"
    #
    # The only caller in this file is check_refs(), which runs after
    # the current file has been finished and possibly after a new one
    # has been started.  The message is therefore tagged with
    # LASTFILENAME rather than FILENAME, and FNR is omitted because no
    # line number is kept for LASTFILENAME.
    print (LASTFILENAME ":%%" message) > "/dev/stderr"
}