This tool contains two files:
* t2h.php - does the filtering
* t2h.sh - shell wrapper; passes arguments to t2h.php
<?php
/*
Copyright 2002 Gregory Keraunen. All Rights Reserved.
PURPOSE:
Translate plain text read from standard input into HTML
What sets this filter apart from the many other text-to-html filters (such as txt2html.pl: http://perl-text2html.sourceforge.net) is:
* text 'markup' syntax is not obviously identifiable as such: quality of text formatting is not compromised
* includes command line options for line wrapping and indentation
markup configuration settings are read from a separate file, similar
The basic syntax rule for text file conversion is:
+ process the file one line at a time
+ look for a 'signal' after any leading white space on each line
+ signals are defined by Perl Regular Expressions (PCRE)
+ ordinarily signals apply to only the line on which they occur
+ TODO: support for more involved processing such as html tables may require start/end signals
+ if wrapping is enabled; wrapped lines include any signal that preceded the original line
+ there are two common actions taken in response to signals: WRAP line, or REPLACE line
+ ordinary tag signals cause the line to be wrapped by tags
+ some signals require unique logic so must use reserved keywords
PROBLEMS:
* Long anchor/fragment lines get split before anchor tags are written causing dramatic change in formatting after line is wrapped.
SOLUTIONS:
1) identify fragments during initial parse, prior to wrapping, prepend formatting tags to wrapped lines
2) optionally specify formatting of fragments and/or their named anchors.
CHANGE LOG:
8/20/05
moved files from /htdocs/proliberty/tools/txt2html to /htdocs/include/xmake.org/xmake/t2h
renamed script from 'txt2html' to 't2h'
*/
///////////////
// Runtime settings are read from environment variables; the t2h.sh wrapper
// exports every name listed in its T2H_PUBLIC_VARS array before starting PHP.
// better would be to set these by looping through:
// $T2H_PUBLIC_VARS array

/* t2hGetenv($name, $legacyName)
Read one setting from the environment.
$name       - the variable name exported by the wrapper
$legacyName - optional older spelling consulted only when $name is not set
returns the value as a string, or false when neither variable is set
*/
function t2hGetenv($name, $legacyName = null)
{
    $value = getenv($name);
    if ($value === false && $legacyName !== null) {
        // earlier versions of this script read a doubled "T2H_T2H_" prefix;
        // keep honouring it so existing environments still work
        $value = getenv($legacyName);
    }
    return $value;
}

$T2H_WRAP_WIDTH=(int)getenv("T2H_WRAP_WIDTH");
$T2H_INDENT= (int)getenv("T2H_INDENT");
$T2H_ANCHORS=(int)getenv("T2H_ANCHORS");
// these three were previously read only under the doubled "T2H_T2H_" names,
// which the wrapper never exports, so the -t and -b options had no effect
$T2H_TAB_SPACES=(int)t2hGetenv("T2H_TAB_SPACES", "T2H_T2H_TAB_SPACES");
$T2H_BREAK_STR=t2hGetenv("T2H_BREAK_STR", "T2H_T2H_BREAK_STR");
$T2H_WRAP_STR=t2hGetenv("T2H_WRAP_STR", "T2H_T2H_WRAP_STR");
$T2H_DEBUG=(int)getenv("T2H_DEBUG");
// $T2H_ANCHORS signals to create links to text fragments; target fragment must also begin with $T2H_FRAGMENT_SIGNAL
$T2H_FRAGMENT_SIGNAL=getenv("T2H_FRAGMENT_SIGNAL");
$T2H_FRAGMENT_TAG=getenv("T2H_FRAGMENT_TAG");
// $T2H_REMOVE_COMMENT_SIGNAL signals a line to ignore; don't print
$T2H_REMOVE_COMMENT_SIGNAL=getenv("T2H_REMOVE_COMMENT_SIGNAL");
$T2H_COMMENT_SIGNAL=getenv("T2H_COMMENT_SIGNAL");
$T2H_COMMENT_TAG=getenv("T2H_COMMENT_TAG");
// $T2H_HEADING_SIGNAL signals $T2H_HEADING_TAG
$T2H_HEADING_SIGNAL=getenv("T2H_HEADING_SIGNAL");
$T2H_HEADING_TAG=getenv("T2H_HEADING_TAG");
// $T2H_STRONG_SIGNAL signals $T2H_STRONG_TAG
$T2H_STRONG_SIGNAL=getenv("T2H_STRONG_SIGNAL");
$T2H_STRONG_TAG=getenv("T2H_STRONG_TAG");
$T2H_STRONG_SIGNAL_SUB=getenv("T2H_STRONG_SIGNAL_SUB");
// $T2H_SMALL_SIGNAL signals $T2H_SMALL_TAG
$T2H_SMALL_SIGNAL=getenv("T2H_SMALL_SIGNAL");
$T2H_SMALL_TAG=getenv("T2H_SMALL_TAG");
$T2H_HR_TAG=getenv("T2H_HR_TAG");
$T2H_HR_SIGNAL=getenv("T2H_HR_SIGNAL");
//trigger_error("T2H_STRONG_SIGNAL_SUB=".$T2H_STRONG_SIGNAL_SUB,E_USER_ERROR); exit (1);
/* This should be obsoleted by config file settings...
// $T2H_REMOVE_COMMENT_SIGNAL signals a line to ignore; don't print
$T2H_REMOVE_COMMENT_SIGNAL = '//# ';
$T2H_COMMENT_TAG = '!--';
// $T2H_HEADING_SIGNAL signals $T2H_HEADING_TAG
$T2H_HEADING_SIGNAL = '/// ';
$T2H_HEADING_TAG = 'h2';
// $T2H_STRONG_SIGNAL signals <b>
$T2H_STRONG_SIGNAL = '* ';
$T2H_STRONG_TAG = 'b';
// $T2H_ANCHORS signals to create links to text fragments; target fragment must also begin with $T2H_FRAGMENT_SIGNAL
$T2H_FRAGMENT_SIGNAL = '*** ';
*/
///////////////////////////////
///////////////////////////////
/* htmlFormatLines($lines)
Convert one logical input line (already split into its wrapped pieces) to HTML.

$lines - array of lines, without trailing "\n"; element 0 is the start of the
         logical line, the remaining elements are its wrapped continuations

Returns:
  * $lines unchanged when $T2H_ANCHORS is empty (HTML conversion disabled)
  * NULL when the line must not be printed (removed comment, or a horizontal
    rule while no $T2H_HR_TAG is configured)
  * otherwise the array with entities escaped, the opening tag for the signal
    found on the first element, links added on every element, and the closing
    tag appended to the last element

All settings are read from the $T2H_* globals.
*/
//////////////////////////////
function htmlFormatLines($lines){
    global $T2H_ANCHORS;
    global $T2H_HEADING_TAG;
    global $T2H_STRONG_SIGNAL;
    global $T2H_SMALL_SIGNAL;
    global $T2H_HEADING_SIGNAL;
    global $T2H_STRONG_TAG;
    global $T2H_STRONG_SIGNAL_SUB;
    global $T2H_REMOVE_COMMENT_SIGNAL;
    global $T2H_COMMENT_SIGNAL;
    global $T2H_COMMENT_TAG;
    global $T2H_DEBUG;
    global $T2H_SMALL_TAG;
    global $T2H_HR_TAG;
    global $T2H_HR_SIGNAL;

    if (!is_array($lines)){
        trigger_error( 'htmlFormatLines(): $lines must be an array',E_USER_ERROR);
        // only reached when an error handler lets execution continue
        return NULL;
    }
    $numlines=count($lines);
    $T2H_DEBUG && trigger_error( "htmlFormatLines(): processsing $numlines lines\n", E_USER_NOTICE);

    if (empty($T2H_ANCHORS) || $numlines === 0){
        // don't do any formatting
        return $lines;
    }

    // must apply htmlspecialchars() before adding any markup
    foreach ($lines as $i => $line){
        $lines[$i] = htmlspecialchars($line);
    }

    // handle horizontal rule signals:
    if (!empty($T2H_HR_SIGNAL)){
        $pcre = '/^\s*('.preg_quote($T2H_HR_SIGNAL, '/').')\s*$/';
    } else {
        $pcre = '/^\s*(---+|===+)\s*$/';
    }
    if ( preg_match( $pcre, $lines[0] ) ){
        if (empty($T2H_HR_TAG)){
            return NULL;
        }
        // <hr/> is self-closing, so no closing tag is recorded
        $lines[0] = preg_replace( $pcre, '<'.$T2H_HR_TAG.' class="t2h-HR"/>', $lines[0] );
        return $lines;
    }

    if (!empty($T2H_REMOVE_COMMENT_SIGNAL)){
        $pcre = '/^(\s*)'.preg_quote($T2H_REMOVE_COMMENT_SIGNAL, '/').'(.*)$/';
        if ( preg_match( $pcre, $lines[0] ) ){
            $T2H_DEBUG && trigger_error( "htmlFormatLines(): removed comment line\n", E_USER_NOTICE);
            return NULL;
        }
    }

    // tag name to close on the last element; stays empty when no signal matched
    $closing_tag = '';

    if (!empty($T2H_COMMENT_SIGNAL)){
        // the comment signal must start in column 0 and is kept in the output
        $pcre = '/^('.preg_quote($T2H_COMMENT_SIGNAL, '/').'.*)$/';
        if ( preg_match( $pcre, $lines[0] ) ){
            $lines[0] = preg_replace( $pcre, '<'.$T2H_COMMENT_TAG.' class="t2h-comment">$1', $lines[0] );
            $closing_tag=$T2H_COMMENT_TAG;
        }
    }
    if (!empty($T2H_HEADING_SIGNAL)){
        // replace the heading signal (after any leading space) with the opening heading tag
        $pcre = '/^(\s*)'.preg_quote($T2H_HEADING_SIGNAL, '/').'(.*)$/';
        if ( preg_match( $pcre, $lines[0] ) ){
            $lines[0] = preg_replace( $pcre, '$1<'.$T2H_HEADING_TAG.' class="t2h-heading">$2', $lines[0] );
            $closing_tag=$T2H_HEADING_TAG;
        }
    }
    if (!empty($T2H_SMALL_SIGNAL)){
        $pcre = '/^(\s*)'.preg_quote($T2H_SMALL_SIGNAL, '/').'(.*)$/';
        if ( preg_match( $pcre, $lines[0] ) ){
            $lines[0] = preg_replace( $pcre, '$1<'.$T2H_SMALL_TAG.' class="t2h-small">$2', $lines[0] );
            $closing_tag=$T2H_SMALL_TAG;
        }
    }
    if (!empty($T2H_STRONG_SIGNAL)){
        $pcre = '/^(\s*)'.preg_quote($T2H_STRONG_SIGNAL, '/').'(.*)$/';
        if ( preg_match( $pcre, $lines[0] ) ){
            if (!empty($T2H_STRONG_SIGNAL_SUB)){
                // the signal text is replaced by $T2H_STRONG_SIGNAL_SUB inside the tag
                $lines[0] = preg_replace( $pcre, '$1<'.$T2H_STRONG_TAG.' class="t2h-strong">'.$T2H_STRONG_SIGNAL_SUB.'$2', $lines[0] );
            } else {
                $lines[0] = preg_replace( $pcre, '$1<'.$T2H_STRONG_TAG.'>$2', $lines[0] );
            }
            $closing_tag=$T2H_STRONG_TAG;
        }
    }

    // NOTE: with the U modifier quantifiers are lazy by default and a trailing
    // "?" makes them greedy, so "[^\s]+?" takes the whole URL.
    // The text has already been through htmlspecialchars(), so a typed double
    // quote appears as &quot; - matching a literal '"' here could never succeed.
    $quotedLinkPcre = '%&quot;(.+)&quot;[\s]+((http|https|ftp|file)://[^\s]+?)(.*)%sU';
    $bareLinkPcre = '%\b((http|https|ftp|file)://[^\s]+?)(.*)%U';
    // "file:" followed by "//" is an absolute URL handled above; skipping it
    // keeps this pattern from rewriting the href that was just generated
    $relativeLinkPcre = '%\bfile:(?!//)([^\s]+)%';
    $mailtoPcre = '/\b(\w+@\w+\.[\w.]+)(.*)/';

    // add anchors before closing formatting tag:
    foreach ($lines as $i => $line){
        // this is not great: if lines have been wrapped, we won't match "hi" http://...
        if ( preg_match( $quotedLinkPcre, $lines[$i] ) ){
            $lines[$i] = preg_replace( $quotedLinkPcre, '<a href="$2" target="_top">$1</a>$4', $lines[$i] );
        } else {
            $lines[$i] = preg_replace( $bareLinkPcre, '<a href="$1" target="_top">$1</a>$3', $lines[$i] );
        }
        // relative links
        $lines[$i] = preg_replace( $relativeLinkPcre, '<a href="$1">$1</a>', $lines[$i] );
        // add mailto: links
        $lines[$i] = preg_replace( $mailtoPcre, '<a href="mailto:$1">$1</a>$2', $lines[$i] );
    }

    if ($closing_tag !== '' && $closing_tag !== false && $closing_tag !== null){
        $lines[$numlines - 1] .= "</$closing_tag>";
    }
    return $lines;
} // htmlFormatLines()
// ---------------------------------------------------------------------------
// Main script: read standard input, wrap and format each line, resolve text
// fragments into links / named anchors, then write the result to standard
// output.
// ---------------------------------------------------------------------------
if( $T2H_DEBUG){
    trigger_error( "width=$T2H_WRAP_WIDTH\n", E_USER_NOTICE);
    trigger_error( "T2H_BREAK_STR=$T2H_BREAK_STR\n", E_USER_NOTICE);
    trigger_error( "T2H_INDENT=$T2H_INDENT\n", E_USER_NOTICE);
    trigger_error( "T2H_ANCHORS=$T2H_ANCHORS\n", E_USER_NOTICE);
    trigger_error( "T2H_TAB_SPACES=$T2H_TAB_SPACES\n", E_USER_NOTICE);
}

// indentation prepended to the continuation lines of a wrapped, signalled line;
// max() guards against a negative setting, which used to loop forever
$tabs = str_repeat(' ', max(0, (int)$T2H_TAB_SPACES));

$INPUT_FILE=getenv("INPUT_FILE");
$fcontents = file('php://stdin');
if( empty($fcontents)){
    // nothing to convert (or stdin could not be read): produce no output
    echo '';
    exit;
}

// output lines, each terminated by "\n"; initialised so that input consisting
// only of removed lines still yields a valid (empty) array
$lines = array();

// we must do the wordwrapping prior to any html formatting
foreach ($fcontents as $line_num => $line) {
    $line=rtrim($line);
    $lineLen = strlen( $line );
    if( $T2H_WRAP_WIDTH && ( $T2H_WRAP_WIDTH < $lineLen ) ){
        // we will wrap lines
        // preserve any formatting signal characters so wrapped lines will have same formatting as unwrapped line
        $pcre = '/^(\s*)((';
        $pcre .= preg_quote($T2H_STRONG_SIGNAL, '/');
        $pcre .='|'.preg_quote($T2H_HEADING_SIGNAL, '/');
        $pcre .='|'.preg_quote($T2H_COMMENT_SIGNAL, '/');
        $pcre .='|'.preg_quote($T2H_REMOVE_COMMENT_SIGNAL, '/');
        $pcre .= ')?)/';
        // every part of the pattern is optional, so it always matches
        preg_match( $pcre, $line, $matches );
        $leading_space = $matches[1];
        $formattingSignal = $matches[2];

        $wordT2H_WRAP_STR=$T2H_WRAP_STR."\n";
        $conditionalTab = "" ;
        if (!empty($formattingSignal)){
            $conditionalTab = $tabs ;
            // comment lines, preserve formatting at start of line
            if ( ($formattingSignal==$T2H_COMMENT_SIGNAL) ||
                 ($formattingSignal==$T2H_REMOVE_COMMENT_SIGNAL)
            ) {
                $wordT2H_WRAP_STR="\n".$formattingSignal.' ';
                // added indentation looks bad so we don't do it
                $conditionalTab = "" ;
            }
        }

        // wordwrap does the line wrapping; explode removes "\n"
        $wrappedLines = explode( "\n", wordwrap( $line, $T2H_WRAP_WIDTH, $wordT2H_WRAP_STR ) );
        $wrappedLines = htmlFormatLines($wrappedLines);
        if (!empty($wrappedLines)){
            // the first piece keeps its own indentation; continuation pieces
            // get the original leading space plus the optional tab
            $lines[] = $wrappedLines[0]."\n";
            $pieces = count($wrappedLines);
            for ($i = 1; $i < $pieces; $i++){
                $lines[] = $leading_space.$conditionalTab.$wrappedLines[$i]."\n";
            }
        } else {
            $T2H_DEBUG && trigger_error( "formatted line is empty\n", E_USER_WARNING);
        }
    } else {
        // $line has no trailing "\n", as htmlFormatLines() expects
        $formattedLine = htmlFormatLines(array($line));
        if (!empty($formattedLine)){
            $lines[] = implode('',$formattedLine)."\n";
        } else {
            $T2H_DEBUG && trigger_error( "formatted line is empty\n", E_USER_WARNING);
        }
    }
}

// keep track of patterns for creating links and named anchors to text fragments:
// output line number => fragment text following $T2H_FRAGMENT_SIGNAL
$fragment_patterns = array();
if( !empty($T2H_ANCHORS) && !empty($T2H_FRAGMENT_SIGNAL) ){
    $pcre = '/^\s*?'.preg_quote( $T2H_FRAGMENT_SIGNAL, '/' ).'(.+)(\s*)$/U';
    foreach ($lines as $line_num => $line) {
        if ( preg_match( $pcre, $line, $matches ) ){
            $fragment_patterns[$line_num] = $matches[1];
        }
    }
}

if( !empty($fragment_patterns) ){
    $fragment_num = 1;
    $total_lines = count($lines);
    // iterate over a snapshot of the keys but read the values live: entries are
    // blanked below once the second occurrence of a fragment has been handled
    foreach (array_keys($fragment_patterns) as $line_num) {
        $pattern = $fragment_patterns[$line_num];
        // if $pattern is empty, we have already handled it
        if ( $pattern === '' ) continue;

        $fragment_name = 'fragment'.$fragment_num++;
        $pcre = '/(.*)('.preg_quote( $T2H_FRAGMENT_SIGNAL, '/' ).')('.preg_quote( $pattern, '/' ).')(\s*)$/U';
        // debug output goes through trigger_error so it does not end up in the generated document
        $T2H_DEBUG && trigger_error( "pcre=$pcre\n", E_USER_NOTICE);
        $named_anchor = '$1<'.$T2H_FRAGMENT_TAG.' class="t2h-fragment"><a name="'.$fragment_name.'"/>$3</'.$T2H_FRAGMENT_TAG.'>$4';

        // look for a matching pattern later in the file...
        for ( $i = $line_num + 1; $i < $total_lines; $i++ ){
            if ( preg_match( $pcre, $lines[$i] ) ){
                // if a match is found, create a link and a named anchor
                // create the link with the first line with the pattern
                $lines[$line_num] = preg_replace( $pcre, '$1<a href="#'.$fragment_name.'">$3</a>$4', $lines[$line_num] );
                // create the named anchor for the subsequent line
                $lines[$i] = preg_replace( $pcre, $named_anchor, $lines[$i] );
                // the second instance has been handled: blank its pattern so
                // the outer loop skips it
                $fragment_patterns[$i]='';
                // continue with the next fragment
                continue 2;
            }
        } // for

        // if we get here, no match was found inside for loop:
        // only create a named anchor, no link
        $lines[$line_num] = preg_replace( $pcre, $named_anchor, $lines[$line_num] );
    } // foreach
} // if

if( !empty($T2H_BREAK_STR) ){
    foreach ($lines as $i=>$line){
        $lines[$i] = str_replace("\n", $T2H_BREAK_STR."\n", $line );
    }
}
echo join( '', $lines );
// NO EMPTY LINES BELOW HERE
?>
#!/usr/bin/env bash
# bash is required: this script uses arrays, [[ ]] and ${!var} indirection.
# Run it as `bash -x t2h.sh ...` to print each line as it is executed; good for debugging.
# Copyright 2002 Gregory Keranen. All Rights Reserved.
# Created 5/21/2002
# PURPOSE
# convert text to html entities
# optionally wrap input
# USAGE:
# t2h.sh [OPTIONS] [INPUT_FILE]
# QUESTIONS/ TODO:
# BUGS:
# sourcing the -f config_file will clobber environmental vars; is this right?
#==============================================================================
PHP=/usr/local/bin/php
# fall back to whatever php is on PATH when the default location is unusable
if [ ! -x "$PHP" ] && command -v php >/dev/null 2>&1
then
  PHP=$(command -v php)
fi
#### DEFAULT SETTINGS:
INPUT_FILE="/dev/stdin"
OUT_FILE="/dev/stdout"
BASENAME=$(basename "$0" .sh)
# Locate the install directory. The script is expected to be reachable on PATH
# as a symlink named $BASENAME; the symlink target gives the real directory.
T2H_LINK_TARGET=$(ls -al "$(which "$BASENAME" 2>/dev/null)" 2>/dev/null | sed -n 's/.*-> \(.*\)$/\1/p')
if [ -n "$T2H_LINK_TARGET" ]
then
  SCRIPT_DIR=$(dirname "$T2H_LINK_TARGET")
else
  # no symlink on PATH: use the directory this script was started from
  SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
fi
TARGET_SCRIPT=${SCRIPT_DIR}/${BASENAME}.phpo
# NOTE(review): the filter is described as ${BASENAME}.php; use it when no .phpo file is installed
if [ ! -e "$TARGET_SCRIPT" ] && [ -e "${SCRIPT_DIR}/${BASENAME}.php" ]
then
  TARGET_SCRIPT=${SCRIPT_DIR}/${BASENAME}.php
fi
# a default config file in this script's directory
T2H_CONFIG_FILE_DEFAULT=${SCRIPT_DIR}/${BASENAME}.conf.default
# a config file in the current working directory
#T2H_CONFIG_FILE_LOCAL="$PWD/${BASENAME}.conf"
# a config file in the user's HOME directory
#T2H_CUSTOM_CONFIG_FILE=${HOME}/.t2h
VERSION="0.1a"
USAGE="Usage: $BASENAME.sh [OPTIONS] [INPUT_FILE]"
#CONFIG_READ=0
# names printed by echoVars only
T2H_PRIVATE_VARS=(\
T2H_CONFIG_FILE_DEFAULT \
T2H_CONFIG_FILE \
)
# names exported to the PHP filter by exportVars
T2H_PUBLIC_VARS=(\
T2H_WRAP_WIDTH \
T2H_BREAK_STR \
T2H_WRAP_STR \
T2H_TAB_SPACES \
T2H_INDENT \
T2H_ANCHORS \
T2H_DEBUG \
T2H_FRAGMENT_SIGNAL \
T2H_FRAGMENT_TAG \
T2H_COMMENT_SIGNAL \
T2H_COMMENT_TAG \
T2H_REMOVE_COMMENT_SIGNAL \
T2H_HEADING_SIGNAL \
T2H_HEADING_TAG \
T2H_STRONG_SIGNAL \
T2H_STRONG_TAG \
T2H_STRONG_SIGNAL_SUB \
T2H_SMALL_SIGNAL \
T2H_SMALL_TAG \
T2H_HR_TAG \
T2H_HR_SIGNAL \
)
export T2H_PUBLIC_VARS
#### END OF DEFAULT SETTINGS
# help: write the usage line ($USAGE) followed by the option summary to stdout.
help() {
cat <<HELP_TEXT
$USAGE
Options:
-f config_file specify a config file to load
-o output_file specify an output file name
-w wrap_width wrap text to specified column width
-i if wrapping, maintain leading-space indentation
-t tab_spaces if wrapping, indent wrapped lines by tab_spaces spaces
-b break_str replace \n with break_str\n
-a disable HTML conversion: no entities or tags
-v print version
-d debug
-h print this help text
HELP_TEXT
}
# Parse command line options.
# syntax of getopts is:
# : after a letter indicates the option expects an argument; E.g., -o output_file
# without :, option doesn't expect anything else; E.g., -h
# the leading : selects silent error reporting: for an unknown option Option is
# set to ? and for a missing argument to :, with the option letter in OPTARG
while getopts ":df:vihao:w:b:t:" Option
do
# ;; required after each case
case $Option in
d ) T2H_DEBUG=1;echo "$BASENAME" version: "$VERSION" >&2;;
v ) echo "$BASENAME" version: "$VERSION"; exit 0;;
h ) help; exit 0;;
f ) T2H_CONFIG_FILE="$OPTARG";;
a ) T2H_ANCHORS=0;;
# OUT_FILE is the variable the final pipeline writes to; T2H_OUT_FILE is kept for compatibility
o ) T2H_OUT_FILE="$OPTARG"; OUT_FILE="$OPTARG";;
w ) T2H_WRAP_WIDTH="$OPTARG";;
i ) T2H_INDENT=1;;
b ) T2H_BREAK_STR="$OPTARG";;
t ) T2H_TAB_SPACES="$OPTARG";;
: ) echo "Option -$OPTARG requires an argument" >&2; exit 1;;
* ) echo "Unimplemented option chosen: -$OPTARG" >&2; exit 1;;
esac
done
# drop the parsed options so that $1 is the first non-option argument
shift $(($OPTIND - 1))
# Resolve the input file: an optional first non-option argument replaces the
# default. A name without a directory part is made absolute using the current
# working directory.
if [ -n "$1" ] # If input file is specified
then
  # the command substitution is quoted so paths containing spaces do not break the test
  if [ "$(dirname "$1")" = "." ]
  then
    INPUT_FILE="$(pwd)/$1"
  else
    INPUT_FILE="$1"
  fi
fi
if [ ! -e "$INPUT_FILE" ] # If input file doesn't exist
then
  echo "error: input file not found: \"$INPUT_FILE\"" >&2; exit 1;
fi
# Load settings: source the config file given with -f, otherwise the default
# config file; exit with status 1 when the chosen file is missing or when
# sourcing it fails.
# NOTE: the status of `.` is the status of the last command in the sourced
# file, so a config file must not end with a failing command.
if [ -n "$T2H_CONFIG_FILE" ]
then
  if [ -f "$T2H_CONFIG_FILE" ]
  then
    if [[ $T2H_DEBUG = 1 ]]
    then
      echo "Using config file:" >&2
      echo "$T2H_CONFIG_FILE" >&2
    fi
    # `.` searches PATH for names without a slash; anchor such names to the
    # current directory, which is where the -f test above found the file
    case "$T2H_CONFIG_FILE" in
      */* ) t2h_config_path="$T2H_CONFIG_FILE";;
      * ) t2h_config_path="./$T2H_CONFIG_FILE";;
    esac
    # test the status of `.` directly: the former `[ ! $? ]` was always false
    if ! . "$t2h_config_path"
    then
      echo "Failed reading config file:" >&2;
      echo "$T2H_CONFIG_FILE" >&2; exit 1
    fi
  else
    echo "Config file not found:" "$T2H_CONFIG_FILE" >&2; exit 1
  fi
elif [ -e "$T2H_CONFIG_FILE_DEFAULT" ]
# source the default config file, if it exists
then
  if [[ $T2H_DEBUG = 1 ]]
  then
    echo "Using config file:" >&2
    echo "$T2H_CONFIG_FILE_DEFAULT" >&2
  fi
  if ! . "$T2H_CONFIG_FILE_DEFAULT"
  then
    echo "Failed reading default config file" >&2;
    echo "$T2H_CONFIG_FILE_DEFAULT" >&2; exit 1
  fi
else
  echo "Default config file not found:" >&2;
  echo "$T2H_CONFIG_FILE_DEFAULT" >&2; exit 1
fi
# exportVars: mark every variable named in T2H_PUBLIC_VARS for export, so the
# PHP filter started by this script can read it from its environment.
exportVars() {
  for public_name in "${T2H_PUBLIC_VARS[@]}"
  do
    export $public_name
  done
}
# echoVars: print one NAME="value" line for every variable named in
# T2H_PUBLIC_VARS, followed by those named in T2H_PRIVATE_VARS. Values are
# looked up through ${!name} indirection.
echoVars() {
  for var in "${T2H_PUBLIC_VARS[@]}" "${T2H_PRIVATE_VARS[@]}"
  do
    value="${!var}"
    echo "$var=\"$value\""
  done
}
# Show the effective settings when debugging, export them, then run the PHP
# filter with the input file on its stdin and its stdout sent to $OUT_FILE.
if [[ $T2H_DEBUG = 1 ]]
then
  echoVars >&2
fi
exportVars
# paths are quoted so file names containing spaces work
"${PHP}" "$TARGET_SCRIPT" < "$INPUT_FILE" > "$OUT_FILE"
# report the filter's own exit status instead of always claiming success
exit $?