This tool contains two files:
* t2h.php - does the filtering
* t2h.sh - shell wrapper; passes arguments to t2h.php
<?php
/*
Copyright 2002 Gregory Keraunen. All Rights Reserved.
PURPOSE:
Translate text from standard input into HTML, written to standard output.
What sets this filter apart from the many other text-to-html filters (such as txt2html.pl: http://perl-text2html.sourceforge.net) is:
* text 'markup' syntax is not obviously identifiable as such: quality of text formatting is not compromised
* includes command line options for line wrapping and indentation
* markup configuration settings are read from a separate file
The basic syntax rule for text file conversion is:
+ process the file one line at a time
+ look for a 'signal' after any leading white space on each line
+ signals are matched using Perl-Compatible Regular Expressions (PCRE)
+ ordinarily signals apply to only the line on which they occur
+ TODO: support for more involved processing such as html tables may require start/end signals
+ if wrapping is enabled, wrapped lines include any signal that preceded the original line
+ there are two common actions taken in response to signals: WRAP line, or REPLACE line
+ ordinary tag signals cause the line to be wrapped by tags
+ some signals require unique logic so must use reserved keywords
PROBLEMS:
* Long anchor/fragment lines get split before anchor tags are written causing dramatic change in formatting after line is wrapped.
SOLUTIONS:
1) identify fragments during initial parse, prior to wrapping, prepend formatting tags to wrapped lines
2) optionally specify formatting of fragments and/or their named anchors.
CHANGE LOG:
8/20/05
moved files from /htdocs/proliberty/tools/txt2html to /htdocs/include/xmake.org/xmake/t2h
renamed script from 'txt2html' to 't2h'
*/
///////////////
// All settings arrive as environment variables (the t2h.sh wrapper exports
// every name listed in its T2H_PUBLIC_VARS array).
// better would be to set these by looping through:
// $T2H_PUBLIC_VARS array

/* t2hGetenv($name, $legacyName)
Read one setting from the environment.
$name       - variable name to look up first
$legacyName - optional older/misspelled name to try when $name is not set
returns the value as a string, or false when neither variable is set
*/
function t2hGetenv($name, $legacyName = NULL){
    $value = getenv($name);
    if ($value === false && $legacyName !== NULL){
        $value = getenv($legacyName);
    }
    return $value;
}

$T2H_WRAP_WIDTH=(int)t2hGetenv("T2H_WRAP_WIDTH");
$T2H_INDENT= (int)t2hGetenv("T2H_INDENT");
$T2H_ANCHORS=(int)t2hGetenv("T2H_ANCHORS");
// These three used to be read ONLY under a doubled "T2H_T2H_" prefix, which
// the wrapper never exports, so the -t and -b options had no effect.
// The doubled names are still honoured as a fallback.
$T2H_TAB_SPACES=(int)t2hGetenv("T2H_TAB_SPACES", "T2H_T2H_TAB_SPACES");
$T2H_BREAK_STR=t2hGetenv("T2H_BREAK_STR", "T2H_T2H_BREAK_STR");
$T2H_WRAP_STR=t2hGetenv("T2H_WRAP_STR", "T2H_T2H_WRAP_STR");
$T2H_DEBUG=(int)t2hGetenv("T2H_DEBUG");
// $T2H_ANCHORS signals to create links to text fragments; target fragment must also begin with $T2H_FRAGMENT_SIGNAL
$T2H_FRAGMENT_SIGNAL=t2hGetenv("T2H_FRAGMENT_SIGNAL");
$T2H_FRAGMENT_TAG=t2hGetenv("T2H_FRAGMENT_TAG");
// $T2H_REMOVE_COMMENT_SIGNAL signals a line to ignore; don't print
$T2H_REMOVE_COMMENT_SIGNAL=t2hGetenv("T2H_REMOVE_COMMENT_SIGNAL");
$T2H_COMMENT_SIGNAL=t2hGetenv("T2H_COMMENT_SIGNAL");
$T2H_COMMENT_TAG=t2hGetenv("T2H_COMMENT_TAG");
// $T2H_HEADING_SIGNAL signals $T2H_HEADING_TAG
$T2H_HEADING_SIGNAL=t2hGetenv("T2H_HEADING_SIGNAL");
$T2H_HEADING_TAG=t2hGetenv("T2H_HEADING_TAG");
// $T2H_STRONG_SIGNAL signals $T2H_STRONG_TAG
$T2H_STRONG_SIGNAL=t2hGetenv("T2H_STRONG_SIGNAL");
$T2H_STRONG_TAG=t2hGetenv("T2H_STRONG_TAG");
$T2H_STRONG_SIGNAL_SUB=t2hGetenv("T2H_STRONG_SIGNAL_SUB");
// $T2H_SMALL_SIGNAL signals $T2H_SMALL_TAG
$T2H_SMALL_SIGNAL=t2hGetenv("T2H_SMALL_SIGNAL");
$T2H_SMALL_TAG=t2hGetenv("T2H_SMALL_TAG");
$T2H_HR_TAG=t2hGetenv("T2H_HR_TAG");
$T2H_HR_SIGNAL=t2hGetenv("T2H_HR_SIGNAL");
//trigger_error("T2H_STRONG_SIGNAL_SUB=".$T2H_STRONG_SIGNAL_SUB,E_USER_ERROR); exit (1);
/* This should be obsoleted by config file settings...
// $T2H_REMOVE_COMMENT_SIGNAL signals a line to ignore; don't print
$T2H_REMOVE_COMMENT_SIGNAL = '//# ';
$T2H_COMMENT_TAG = '!--';
// $T2H_HEADING_SIGNAL signals $T2H_HEADING_TAG
$T2H_HEADING_SIGNAL = '/// ';
$T2H_HEADING_TAG = 'h2';
// $T2H_STRONG_SIGNAL signals <b>
$T2H_STRONG_SIGNAL = '* ';
$T2H_STRONG_TAG = 'b';
// $T2H_ANCHORS signals to create links to text fragments; target fragment must also begin with $T2H_FRAGMENT_SIGNAL
$T2H_FRAGMENT_SIGNAL = '*** ';
*/
///////////////////////////////
///////////////////////////////
/* htmlFormatLines($lines)
Convert one logical input line (already split into its wrapped pieces)
to HTML.

$lines - array of lines, without trailing "\n"; element 0 is the piece
         that carries the formatting signal, the last element receives
         the closing tag.

returns:
  * $lines unchanged when $T2H_ANCHORS is empty (no entities, no tags)
  * NULL when the line must not be printed (remove-comment signal, or a
    horizontal-rule signal while $T2H_HR_TAG is empty)
  * otherwise the array of HTML-escaped, tagged and linked lines

All signal/tag settings are read from the $T2H_* globals.
*/
//////////////////////////////
function htmlFormatLines($lines){
    global $T2H_ANCHORS, $T2H_DEBUG;
    global $T2H_HR_SIGNAL, $T2H_HR_TAG;
    global $T2H_REMOVE_COMMENT_SIGNAL;
    global $T2H_COMMENT_SIGNAL, $T2H_COMMENT_TAG;
    global $T2H_HEADING_SIGNAL, $T2H_HEADING_TAG;
    global $T2H_SMALL_SIGNAL, $T2H_SMALL_TAG;
    global $T2H_STRONG_SIGNAL, $T2H_STRONG_TAG, $T2H_STRONG_SIGNAL_SUB;

    if (!is_array($lines)){
        trigger_error( 'htmlFormatLines(): $lines must be an array',E_USER_ERROR);
        // only reached when a custom error handler lets execution continue
        return NULL;
    }
    $numlines = count($lines);
    $T2H_DEBUG && trigger_error( "htmlFormatLines(): processsing $numlines lines\n", E_USER_NOTICE);

    if (empty($T2H_ANCHORS)){
        // don't do any formatting
        return $lines;
    }
    if ($numlines === 0){
        // nothing to format; avoids reading $lines[0] on an empty array
        return $lines;
    }

    // must apply htmlspecialchars() before adding any markup
    foreach ($lines as $i => $line){
        $lines[$i] = htmlspecialchars($line);
    }
    // From here on every line is HTML-escaped, so each configured signal is
    // escaped the same way before being matched against it; otherwise a
    // signal containing <, >, & or " could never match.

    $closing_tag = '';

    // handle horizontal rule signals:
    if (!empty($T2H_HR_SIGNAL)){
        $pcre = '/^\s*('.preg_quote(htmlspecialchars($T2H_HR_SIGNAL), '/').')\s*$/';
    } else {
        $pcre = '/^\s*(---+|===+)\s*$/';
    }
    if ( preg_match( $pcre, $lines[0] ) ){
        if (empty($T2H_HR_TAG)){
            // no tag configured: drop the rule line altogether
            return NULL;
        }
        // we don't set $closing_tag for <hr/>
        $lines[0] = preg_replace( $pcre, '<'.$T2H_HR_TAG.' class="t2h-HR"/>', $lines[0] );
        return $lines;
    }

    // lines starting with $T2H_REMOVE_COMMENT_SIGNAL are not printed at all
    if (!empty($T2H_REMOVE_COMMENT_SIGNAL)){
        $pcre = '/^(\s*)'.preg_quote(htmlspecialchars($T2H_REMOVE_COMMENT_SIGNAL), '/').'(.*)$/';
        if ( preg_match( $pcre, $lines[0] ) ){
            $T2H_DEBUG && trigger_error( "htmlFormatLines(): removed comment line\n", E_USER_NOTICE);
            return NULL;
        }
    }

    // $T2H_COMMENT_SIGNAL: the signal must start the line and stays in the output
    if (!empty($T2H_COMMENT_SIGNAL)){
        $pcre = '/^('.preg_quote(htmlspecialchars($T2H_COMMENT_SIGNAL), '/').'.*)$/';
        if ( preg_match( $pcre, $lines[0] ) ){
            $lines[0] = preg_replace( $pcre, '<'.$T2H_COMMENT_TAG.' class="t2h-comment">$1', $lines[0] );
            $closing_tag = $T2H_COMMENT_TAG;
        }
    }

    // $T2H_HEADING_SIGNAL: replace the signal with <$T2H_HEADING_TAG>
    if (!empty($T2H_HEADING_SIGNAL)){
        $pcre = '/^(\s*)'.preg_quote(htmlspecialchars($T2H_HEADING_SIGNAL), '/').'(.*)$/';
        if ( preg_match( $pcre, $lines[0] ) ){
            $lines[0] = preg_replace( $pcre, '$1<'.$T2H_HEADING_TAG.' class="t2h-heading">$2', $lines[0] );
            $closing_tag = $T2H_HEADING_TAG;
        }
    }

    // $T2H_SMALL_SIGNAL: replace the signal with <$T2H_SMALL_TAG>
    if (!empty($T2H_SMALL_SIGNAL)){
        $pcre = '/^(\s*)'.preg_quote(htmlspecialchars($T2H_SMALL_SIGNAL), '/').'(.*)$/';
        if ( preg_match( $pcre, $lines[0] ) ){
            $lines[0] = preg_replace( $pcre, '$1<'.$T2H_SMALL_TAG.' class="t2h-small">$2', $lines[0] );
            $closing_tag = $T2H_SMALL_TAG;
        }
    }

    // $T2H_STRONG_SIGNAL: replace the signal with <$T2H_STRONG_TAG>,
    // optionally followed by the substitute text $T2H_STRONG_SIGNAL_SUB
    if (!empty($T2H_STRONG_SIGNAL)){
        $pcre = '/^(\s*)'.preg_quote(htmlspecialchars($T2H_STRONG_SIGNAL), '/').'(.*)$/';
        if ( preg_match( $pcre, $lines[0] ) ){
            if (!empty($T2H_STRONG_SIGNAL_SUB)){
                $lines[0] = preg_replace( $pcre, '$1<'.$T2H_STRONG_TAG.' class="t2h-strong">'.$T2H_STRONG_SIGNAL_SUB.'$2', $lines[0] );
            } else {
                $lines[0] = preg_replace( $pcre, '$1<'.$T2H_STRONG_TAG.'>$2', $lines[0] );
            }
            $closing_tag = $T2H_STRONG_TAG;
        }
    }

    // Link patterns (loop-invariant, so built once).
    // "label" URL -- the quotes are matched as &quot; because the text has
    // already been through htmlspecialchars(); a literal " can never occur.
    $labelled_url = '%&quot;(.+)&quot;[\s]+((http|https|ftp|file)://[^\s]+?)(.*)%sU';
    $bare_url     = '%\b((http|https|ftp|file)://[^\s]+?)(.*)%U';
    // relative links: file:path -- (?!//) keeps this pass away from the
    // file://... anchors written just above, which it used to mangle
    $relative_url = '%\b(file:(?!//)([^\s]+?))(.*)%U';
    $mail_address = '/\b(\w+@\w+\.[\w.]+)(.*)/';

    // add anchors before closing formatting tag:
    foreach ($lines as $i => $line){
        // this is not great: if lines have been wrapped, we won't match "hi" http://...
        if ( preg_match( $labelled_url, $line ) ){
            $line = preg_replace( $labelled_url, '<a href="$2" target="_top">$1</a>$4', $line );
        } else {
            $line = preg_replace( $bare_url, '<a href="$1" target="_top">$1</a>$3', $line );
        }
        $line = preg_replace( $relative_url, '<a href="$2">$2</a>$3', $line );
        // add mailto: links
        $line = preg_replace( $mail_address, '<a href="mailto:$1">$1</a>$2', $line );
        $lines[$i] = $line;
    }

    if (!empty($closing_tag)){
        $lines[$numlines - 1] = $lines[$numlines - 1]."</$closing_tag>";
    }
    return $lines;
} // htmlFormatLines()
// In debug mode, report the effective wrapping settings as notices.
if ($T2H_DEBUG) {
    trigger_error( "width=$T2H_WRAP_WIDTH\n", E_USER_NOTICE);
    trigger_error( "T2H_BREAK_STR=$T2H_BREAK_STR\n", E_USER_NOTICE);
    trigger_error( "T2H_INDENT=$T2H_INDENT\n", E_USER_NOTICE);
    trigger_error( "T2H_ANCHORS=$T2H_ANCHORS\n", E_USER_NOTICE);
    trigger_error( "T2H_TAB_SPACES=$T2H_TAB_SPACES\n", E_USER_NOTICE);
}
// $tabs is the indentation (spaces) prepended to continuation lines.
// The count is clamped at 0: the former "while ($T2H_TAB_SPACES--)" loop
// never terminated when the setting was negative.
$tabs = str_repeat(' ', max(0, (int)$T2H_TAB_SPACES));
// Name of the input file as seen by the wrapper; only useful for diagnostics.
$INPUT_FILE=getenv("INPUT_FILE");
// Read all of standard input, one array element per line (newline kept).
// php://stdin is PHP's own stream for standard input and does not depend
// on the platform providing a /dev/stdin device file.
$fcontents = file('php://stdin');
if (empty($fcontents)) {
    /*
    trigger_error( "empty input: $INPUT_FILE\n", E_USER_NOTICE);
    */
    // empty (or unreadable) input: produce no output at all
    echo '';
    exit;
}
// we must do the wordwrapping prior to any html formatting
//
// Builds $lines: the formatted output lines, each ending in "\n".
// $lines is initialised here so later stages always find an array, even
// when every input line was removed.
$lines = array();

// Captures (1) leading white space and (2) the formatting signal, if any,
// that starts the line. Built once: it does not depend on the line.
$signal_pcre = '/^(\s*)(('
    . preg_quote((string)$T2H_STRONG_SIGNAL, '/')
    . '|' . preg_quote((string)$T2H_HEADING_SIGNAL, '/')
    . '|' . preg_quote((string)$T2H_COMMENT_SIGNAL, '/')
    . '|' . preg_quote((string)$T2H_REMOVE_COMMENT_SIGNAL, '/')
    . ')?)/';

// foreach replaces the list()/each() idiom; each() no longer exists in PHP 8
foreach ($fcontents as $line) {
    // remove trailing "\n", for consistency of input to htmlFormatLines()
    $line = rtrim($line);

    if ( $T2H_WRAP_WIDTH && ( $T2H_WRAP_WIDTH < strlen($line) ) ) {
        // we will wrap lines
        // preserve any formatting signal characters so wrapped lines will have same formatting as unwrapped line
        preg_match( $signal_pcre, $line, $matches );
        $leading_space = $matches[1];
        $formattingSignal = $matches[2];

        $wrap_break = $T2H_WRAP_STR."\n";
        $conditionalTab = '';
        if (!empty($formattingSignal)) {
            $conditionalTab = $tabs;
            // comment lines, preserve formatting at start of line
            if ( ($formattingSignal === $T2H_COMMENT_SIGNAL) ||
                 ($formattingSignal === $T2H_REMOVE_COMMENT_SIGNAL) ) {
                $wrap_break = "\n".$formattingSignal.' ';
                // added indentation looks bad so we don't do it
                $conditionalTab = '';
            }
        }

        // wordwrap does the line wrapping; explode removes "\n"
        $wrappedLines = explode( "\n", wordwrap( $line, $T2H_WRAP_WIDTH, $wrap_break ) );
        $wrappedLines = htmlFormatLines($wrappedLines);
        if (empty($wrappedLines)) {
            $T2H_DEBUG && trigger_error( "formatted line is empty\n", E_USER_WARNING);
            continue;
        }
        // the first piece keeps its own indentation; continuation pieces
        // get the original leading white space plus the optional tab
        $lines[] = $wrappedLines[0]."\n";
        $pieces = count($wrappedLines);
        for ($i = 1; $i < $pieces; $i++) {
            $lines[] = $leading_space.$conditionalTab.$wrappedLines[$i]."\n";
        }
    } else {
        $formattedLine = htmlFormatLines(array($line));
        if (empty($formattedLine)) {
            $T2H_DEBUG && trigger_error( "formatted line is empty\n", E_USER_WARNING);
            continue;
        }
        $lines[] = implode('', $formattedLine)."\n";
    }
}
// keep track of patterns for creating links and named anchors to text fragments
//
// $fragment_patterns maps a line number in $lines to the fragment text that
// follows $T2H_FRAGMENT_SIGNAL at the start of that line. It stays empty
// when anchors are disabled or no fragment signal is configured.
$fragment_patterns = array();
if ( !empty($T2H_ANCHORS) && !empty($T2H_FRAGMENT_SIGNAL) && !empty($lines) ) {
    // built once; the pattern does not depend on the line being examined
    $pcre = '/^\s*?'.preg_quote( $T2H_FRAGMENT_SIGNAL, '/' ).'(.+)(\s*)$/U';
    // foreach replaces the list()/each() idiom; each() no longer exists in PHP 8
    foreach ($lines as $line_num => $line) {
        if ( preg_match( $pcre, $line, $matches ) ) {
            $fragment_patterns[$line_num] = $matches[1];
        }
    }
}
// Turn fragment lines into links and named anchors.
//
// For each recorded fragment: when a LATER line ends with the same
// signal + text, the first line becomes a link (<a href="#fragmentN">) and
// the later line becomes the named anchor it points to. When no later line
// matches, the fragment line itself becomes a named anchor only.
if ( !empty($fragment_patterns) ) {
    $fragment_num = 1;
    $total_lines = count($lines);
    // Walk a snapshot of the keys but read each value live: entries are
    // blanked below to mark them handled, and that must be seen here.
    // (each() did this implicitly; it no longer exists in PHP 8.)
    foreach (array_keys($fragment_patterns) as $line_num) {
        $pattern = $fragment_patterns[$line_num];
        // if $pattern is empty, we have already handled it
        if ( $pattern === '' ) continue;

        $fragment_name = 'fragment'.$fragment_num++;
        $pcre = '/(.*)('.preg_quote( $T2H_FRAGMENT_SIGNAL, '/' ).')('.preg_quote( $pattern, '/' ).')(\s*)$/U';
        // debug output goes through trigger_error so it does not end up
        // inside the generated HTML on standard output
        $T2H_DEBUG && trigger_error( "pcre=$pcre", E_USER_NOTICE);

        $named_anchor = '$1<'.$T2H_FRAGMENT_TAG.' class="t2h-fragment"><a name="'.$fragment_name.'"/>$3</'.$T2H_FRAGMENT_TAG.'>$4';

        // look for a matching pattern later in the file...
        // (bounded by the line count: taking a reference to $lines[$i]
        // past the end used to append a NULL element to $lines)
        $linked = false;
        for ( $i = $line_num + 1; $i < $total_lines; $i++ ) {
            if ( !preg_match( $pcre, $lines[$i] ) ) continue;
            // if a match is found, create a link and a named anchor
            // create the link with the first line with the pattern
            $lines[$line_num] = preg_replace( $pcre, '$1<a href="#'.$fragment_name.'">$3</a>$4', $lines[$line_num] );
            // create the named anchor for the subsequent line
            $lines[$i] = preg_replace( $pcre, $named_anchor, $lines[$i] );
            // the second instance has now been handled: blank its pattern
            // so the outer loop skips it
            $fragment_patterns[$i] = '';
            $linked = true;
            break;
        }
        if ( !$linked ) {
            // no match was found: only create a named anchor, no link
            $lines[$line_num] = preg_replace( $pcre, $named_anchor, $lines[$line_num] );
        }
    }
}
// Write the result to standard output.
// $lines may be undefined when every input line was removed; treat that
// as "no output" instead of failing inside implode().
if ( !isset($lines) || !is_array($lines) ) {
    $lines = array();
}
if ( !empty($T2H_BREAK_STR) ) {
    // put $T2H_BREAK_STR in front of every newline
    foreach ($lines as $i => $line) {
        // (string) guards against NULL entries in $lines
        $lines[$i] = str_replace("\n", $T2H_BREAK_STR."\n", (string)$line );
    }
}
echo implode( '', $lines );
// NO EMPTY LINES BELOW HERE
?>
#!/bin/bash
# This script needs bash, not plain sh: it uses arrays, [[ ]] and ${!var}.
# "#!/bin/bash -x" prints each line as it is executed; good for debugging.
# Copyright 2002 Gregory Keranen. All Rights Reserved.
# Created 5/21/2002
# PURPOSE
#   convert text to html entities
#   optionally wrap input
# USAGE:
#   t2h.sh [OPTIONS] [INPUT_FILE]
# QUESTIONS/ TODO:
# BUGS:
#   sourcing the -f config_file will clobber environmental vars; is this right?
#   (the config file is sourced AFTER option parsing, so it can also
#   overwrite values given on the command line)
#==============================================================================
PHP=/usr/local/bin/php

#### DEFAULT SETTINGS:
INPUT_FILE="/dev/stdin"
OUT_FILE="/dev/stdout"
BASENAME=$(basename "$0" .sh)

# this needs work:
# Resolve the directory of the real script when $BASENAME is a symlink found
# in PATH; otherwise fall back to the directory this script was started from.
SCRIPT_LINK=$(ls -al "$(which "$BASENAME" 2>/dev/null)" 2>/dev/null | sed -n 's/.*-> \(.*\)$/\1/p')
if [ -n "$SCRIPT_LINK" ]
then
  SCRIPT_DIR=$(dirname "$SCRIPT_LINK")
else
  SCRIPT_DIR=$(dirname "$0")
fi

TARGET_SCRIPT=${SCRIPT_DIR}/${BASENAME}.phpo
# NOTE(review): the accompanying description names the filter "t2h.php";
# ".phpo" may be a build artifact or a typo -- confirm. Until then, use the
# .php file when no .phpo file exists.
if [ ! -e "$TARGET_SCRIPT" ] && [ -e "${SCRIPT_DIR}/${BASENAME}.php" ]
then
  TARGET_SCRIPT=${SCRIPT_DIR}/${BASENAME}.php
fi

# a default config file in this script's directory
T2H_CONFIG_FILE_DEFAULT=${SCRIPT_DIR}/${BASENAME}.conf.default
# a config file in the current working directory
#T2H_CONFIG_FILE_LOCAL="$PWD/${BASENAME}.conf"
# a config file in the user's HOME directory
#T2H_CUSTOM_CONFIG_FILE=${HOME}/.t2h
VERSION="0.1a"
USAGE="Usage: $BASENAME.sh [OPTIONS] [INPUT_FILE]"
#CONFIG_READ=0

# variables shown in debug output only
T2H_PRIVATE_VARS=(\
  T2H_CONFIG_FILE_DEFAULT \
  T2H_CONFIG_FILE \
)
# variables exported to the PHP filter
T2H_PUBLIC_VARS=(\
  T2H_WRAP_WIDTH \
  T2H_BREAK_STR \
  T2H_WRAP_STR \
  T2H_TAB_SPACES \
  T2H_INDENT \
  T2H_ANCHORS \
  T2H_DEBUG \
  T2H_FRAGMENT_SIGNAL \
  T2H_FRAGMENT_TAG \
  T2H_COMMENT_SIGNAL \
  T2H_COMMENT_TAG \
  T2H_REMOVE_COMMENT_SIGNAL \
  T2H_HEADING_SIGNAL \
  T2H_HEADING_TAG \
  T2H_STRONG_SIGNAL \
  T2H_STRONG_TAG \
  T2H_STRONG_SIGNAL_SUB \
  T2H_SMALL_SIGNAL \
  T2H_SMALL_TAG \
  T2H_HR_TAG \
  T2H_HR_SIGNAL \
)
export T2H_PUBLIC_VARS
#### END OF DEFAULT SETTINGS

# adapt this to my purpose
help()
{
cat << eof
$USAGE
Options:
  -f config_file   specify a config file to load
  -o output_file   specify an output file name
  -w wrap_width    wrap text to specified column width
  -i               if wrapping, maintain leading-space indentation
  -t tab_spaces    if wrapping, indent wrapped lines by tab_spaces spaces
  -b break_str     replace \n with break_str\n
  -a               disable HTML conversion: no entities or tags
  -v               print version
  -d               debug
  -h               print this help text
eof
}

# syntax of getopts is:
# : indicates option expects argument string after; E.g., -o output_file
# without :, option doesn't expect anything else; E.g., -h
# The leading ':' selects silent error reporting: for an unknown option or a
# missing argument, getopts stores the offending option letter in OPTARG.
while getopts ":df:vihao:w:b:t:" Option
do
  # ;; required after each case
  case $Option in
    d ) T2H_DEBUG=1; echo "$BASENAME" version: "$VERSION" >&2;;
    v ) echo "$BASENAME" version: "$VERSION"; exit 0;;
    h ) help; exit 0;;
    f ) T2H_CONFIG_FILE="$OPTARG";;
    a ) T2H_ANCHORS=0;;
    # OUT_FILE is what the final command writes to; previously only
    # T2H_OUT_FILE was set here, so -o had no effect
    o ) OUT_FILE="$OPTARG"; T2H_OUT_FILE="$OPTARG";;
    w ) T2H_WRAP_WIDTH="$OPTARG";;
    i ) T2H_INDENT=1;;
    b ) T2H_BREAK_STR="$OPTARG";;
    t ) T2H_TAB_SPACES="$OPTARG";;
    * ) echo "Unimplemented option chosen: -$OPTARG" >&2; exit 1;;
  esac
done
# this decrements the argument pointer so it points to next argument after the options.
shift $((OPTIND - 1))

if [ -n "$1" ] # If input file is specified
then
  if [ "$(dirname "$1")" = "." ]
  then
    INPUT_FILE=$(pwd)"/$1"
  else
    INPUT_FILE="$1"
  fi
fi

if [ ! -e "$INPUT_FILE" ] # If input file doesn't exist
then
  echo "error: input file not found: \"$INPUT_FILE\"" >&2; exit 1;
fi

# source the specified config file, if it exists
# ("if ! . file" tests the real exit status; the former "[ ! $? ]" tested a
# non-empty string and therefore never detected a failure)
if [ -n "$T2H_CONFIG_FILE" ]
then
  if [ -f "$T2H_CONFIG_FILE" ]
  then
    if [[ $T2H_DEBUG = 1 ]]
    then
      echo "Using config file:" >&2
      echo "$T2H_CONFIG_FILE" >&2
    fi
    if ! . "$T2H_CONFIG_FILE"
    then
      echo "Failed reading config file:" >&2; echo "$T2H_CONFIG_FILE" >&2; exit 1
    fi
  else
    echo "Config file not found:" "$T2H_CONFIG_FILE" >&2; exit 1
  fi
elif [ -e "$T2H_CONFIG_FILE_DEFAULT" ] # source the default config file, if it exists
then
  if [[ $T2H_DEBUG = 1 ]]
  then
    echo "Using config file:" >&2
    echo "$T2H_CONFIG_FILE_DEFAULT" >&2
  fi
  if ! . "$T2H_CONFIG_FILE_DEFAULT"
  then
    echo "Failed reading default config file" >&2; echo "$T2H_CONFIG_FILE_DEFAULT" >&2; exit 1
  fi
else
  echo "Default config file not found:" >&2; echo "$T2H_CONFIG_FILE_DEFAULT" >&2; exit 1
fi

# export every variable named in T2H_PUBLIC_VARS to the PHP filter
exportVars()
{
  local var
  for var in "${T2H_PUBLIC_VARS[@]}"
  do
    export "$var"
  done
}

# print NAME="value" for every public and private variable
echoVars()
{
  local var value
  for var in "${T2H_PUBLIC_VARS[@]}" "${T2H_PRIVATE_VARS[@]}"
  do
    value="${!var}"
    echo "$var=\"$value\""
  done
}

if [[ $T2H_DEBUG = 1 ]]
then
  echoVars >&2
fi
exportVars
# the PHP filter also reads INPUT_FILE from its environment
export INPUT_FILE

# run the filter: input file on stdin, output to "$OUT_FILE";
# the filter's exit status becomes this script's exit status
"${PHP}" "$TARGET_SCRIPT" < "$INPUT_FILE" > "$OUT_FILE"
exit $?