#!/bin/bash

# Copyright (C) 2009-2010 Nokia Corporation. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# Abort on the first failing command and on any use of an unset variable.
set -e
set -u

# Make sure command tracing is off (no-op unless inherited from the caller).
set +x

# parameter defaults
inpath=""       # --in: input directory or single html file (mandatory)
outpath=""      # --out: output directory or single wiki file (mandatory)
maxPageSize=400 # split pages larger than this (kB)

# Prints the usage/help text to stdout.  The here-document interpolates
# the current values of ${inpath}, ${outpath} and ${maxPageSize}, so the
# defaults shown track whatever has been parsed so far.
function help_func () 
{
    cat << end_of_help_text
     
USAGE: 
  ./maemodoc-wikify     --in <input dir or file>  --out <output dir or file>  
                        [ --pagesize <size in kilobytes> ]  [ --help ] 
       
Converts a set of html files to wiki format. The script uses html2wiki as a 
work horse, but makes a large amount of customized fixes to the results. Also, 
any oversized wiki files are automatically split into parts.

OPTIONS
----------
--in <path>:  Normally <path> points to a directory and the script then looks 
              in that folder and converts all html files it
              can find and copies jpg and png images to output folder. <path> 
              can also point to a single html file in which case only that file 
              is converted (--out should point to a file in this case), and no 
              images are copied to output folder. Default: "${inpath}"

--out <path>: Output is saved in this directory. If it doesn't exist, it will 
              be created. If input is a single html file, <path> is the name of 
              the output file. Default: "${outpath}".

--pagesize <size>: Split wiki files that are larger than <size> kilobytes. Note 
                   that this automatic splitting does not guarantee an optimal 
                   split point. Default: "${maxPageSize}".

Examples:
---------
1) Converting a set of html files to wiki format in maemo docs baseline:
./maemodoc-wikify --in releases/html/Developer_Guide --out releases/wiki

2) Converting a single file:
./maemodoc-wikify test.html test.wiki

Conversion notes:
-----------------
This is not a generic html->wiki converter, but only meant for html generated 
from LaTeX in maemo documentation baseline. Some limitations:
  -only allowed wikilinks point to the same page (chapter), there must be a 
   section label \label{sec:something} in the original latex file, and 
   latex2html must have been called with '-show-section-numbers'.
  -lists, especially numbered, with code boxes are broken
  -this script automatically splits oversized wiki pages. Check the result
   yourself, and move stuff around if you do not like the splitting point.
  -if there are input code snippets (\\lstinputlisting) in the original latex 
   files, html should be generated without '--highlight' to latex2html to get 
   cleaner wiki code. When converted to wiki, these snippets have <nowiki> or 
   <code> tags that are usually not necessary, and could be manually removed. 
   They are only needed if there is some potential wiki code in the snippet. 
   The result of these tags is an extra space on each line of the 
   snippet. 
  -sometimes the first lines of paragraphs are indented by one space, 
   creating an unintended code block. This is probably a bug in html2wiki. Does 
   not seem to happen if all text is encloded in <p> tags.
 
end_of_help_text
}

# Partially converts its parameter string $1 to a regex by
# backslash-escaping the sed metacharacters \ [ ] / . &
# The result is stored in the global variable retval.
#
# printf '%s\n' is used instead of echo so the argument reaches sed
# verbatim: an unquoted echo would word-split and glob-expand the value,
# and echo may interpret backslash escapes in some shells.
function to_regex() {
    retval=$( printf '%s\n' "${1}" | sed 's/\\/\\\\/g
                                          s/\[/\\\[/g
                                          s/\]/\\\]/g
                                          s/\./\\\./g
                                          s/\//\\\//g
                                          s/\&/\\\&/g' )
}


# Computes the apparent size of the file given as param $1 in kilobytes.
# The result is returned in the global variable $retval.
function get_file_size() {
    # du prints "<kilobytes>\t<name>"; keep only the leading number field.
    retval=$( du --apparent-size -sk "$1" | awk '{ print $1; exit }' )
}


# auxiliary func of wikify_file; converts html tag pairs to wiki format,
# editing the file in place.  The sed program repeatedly finds the opening
# tag, pulls following lines into the pattern space until the closing tag
# appears, and replaces both with the wiki equivalents.
# See usage examples in wikify_file().
# Params:
#   $1: name of the wiki file to be operated on
#   $2: opening html tag as regexp 
#   $3: closing html tag as regexp, e.g., <\/span>
#   $4: wiki open tag equivalent
#   $5: wiki close tag (if omitted, wiki close tag=wiki open tag)
function convert_tag() {
    local wikiFile=${1}
    local tag=${2}
    local closeTag=${3}
    local wikiTag=${4}
    local wikiCloseTag=${wikiTag}
    # plain test instead of a needless ( ... ) subshell
    [ ${#} -eq 5 ] && wikiCloseTag=${5}

    sed -i "
  : top
    /${tag}/ ! b
    s/${tag}/${wikiTag}\n/
    /\n.*${closeTag}/ ! {
      s/\n//
      : loop
        n
        /${closeTag}/ ! b loop
    }
    s/${closeTag}\([^\n]*\)$/${wikiCloseTag}\1/
    s/\n//
    b top" "${wikiFile}"
}


# converts a html file to wiki format and, if the result exceeds the page
# size limit, splits it into numbered part files (<output>1, <output>2, ...).
# Uses html2wiki for the base conversion, then applies a long series of sed
# fixups for latex2html artifacts (boxes, code listings, section links,
# bibliography references, figure anchors).
# Params:
#   $1: name of the input html file
#   $2: name of the output wiki file
#   $3: maximum wiki page size in kb 
function wikify_file() {
    local file=${1}                         # html file to be converted 
    local wikiFile=$2                       # wiki file with path
    local rWikiFile=$( basename $wikiFile ) # name of the wiki file without path
    local htmldir=$( dirname $1 )           # directory of the html file    
    local maxPageSize=$3
    
    # work on a temporary copy so the original html file is left untouched
    tmpfile=$( mktemp -t $( basename "${file}.XXXXXX" ) )

    cp ${file} ${tmpfile}

    # Replace the sup-small mess with an ascii representation of the LaTeX logo
    sed -ri 's/<[Ss][Pp][Aa][Nn] [Cc][Ll][Aa][Ss][Ss]="logo,([^>]+)">.*<\/[Ss][Po][Aa][Nn]>/\1 /' ${tmpfile}

    # add newline markers in boxes (multi-line boxes are joined into one
    # pattern space; the markers are turned back into newlines further below)
    sed -rni '
      : seek_box_start
        /<div class="(gray|note)box">/ {
          : seek_box_end
            /<\/div>/ {
              s/\n/!NEWLINE_IN_BOX!/g
              b end
            }
            $ b end
            N
            b seek_box_end
        }
        p
        $ b end
        n
        b seek_box_start
      : end
        N
        p' ${tmpfile}


    # wrap curly braces in <nowiki> so they are not taken as wiki markup
    sed -i 's/{/<nowiki>{/g' ${tmpfile}
    sed -i 's/}/}<\/nowiki>/g' ${tmpfile}
 
    # do the wiki conversion
    html2wiki --dialect=MediaWiki --base-uri=localhost \
	--wiki-uri=http://wiki.maemo.org/ ${tmpfile} > ${wikiFile};

    # Fix LaTEX in lists
    sed -ri 's/(^[*]* [[].*)L<sup>A<\/sup>TEX( )?/\1LaTeX /g' ${wikiFile}

    # remove arabic spans
    convert_tag ${wikiFile} '<span class="arabic">' '<\/span>' ''

    # remove centering divs
    convert_tag ${wikiFile} '<div align="CENTER">' '<\/div>' ''    
    
    # transform textit to MediaWiki italics
    convert_tag ${wikiFile} '<span class="textit">' '<\/span>' "\'\'"
    
    # transform textbf to MediaWiki bold
    convert_tag ${wikiFile} '<span class="textbf">' '<\/span>' "\'\'\'"
    
    # remove <br /> tags
    sed -i 's/<br \/>//g' ${wikiFile}
        
    # remove indents (these should only be generated by grayboxes)
    sed -ri 's/^( )*//' ${wikiFile}

    # multiline code -> wiki code box (indent every line by one space)
    sed -rni '
      : seek_code_start
        /<code><nowiki>/ {
          /<\/code>/ b end
          s/<nowiki>/<nowiki>\n/
          s/^/ /
          : seek_code_end
            /<\/code>/ {
              s/\n/\n /g
              s/<\/nowiki>//
              s/<\/code>/\n <\/nowiki><\/code>/
              b end
            }
            $ b end
            N
            b seek_code_end
        }
        p
        $ b end
        n
        b seek_code_start
      : end
        N
        p' ${wikiFile}


    # graybox -> wiki code box
    sed -rni '
      : seek_box_start
        /<div class="graybox">/ {
          : seek_box_end
            /<\/div>/ {
              s/!NEWLINE_IN_BOX!/\n /g
              s/<div class="graybox">//
              s/<\/div>//
              s/^/ /
              b end
            }
            $ b end
            N
            b seek_box_end
        }
        p
        $ b end
        n
        b seek_box_start
      : end
        N
        p' ${wikiFile}

    # notebox replacement: by default a one-row wiki table with an icon;
    # with --use-template-note, the {{note}} template instead
    NB_REPL_BEG="{|\n|-\n| [[Image:dialog-information.png]]\n| '''Note:''' "
    NB_REPL_END="\n|}"

    if ( test ${USE_TEMPLATE_NOTE} == ${YES} ) ; then
	NB_REPL_BEG="{{note}}"
	NB_REPL_END=""
    fi

    # notebox -> 
    sed -rni "
      : seek_box_start
        /<div class=\"notebox\">/ {
          : seek_box_end
            /<\/div>/ {
              s/!NEWLINE_IN_BOX!//
              s/N\.B\.//
              s/<div class=\"notebox\">/${NB_REPL_BEG}/
              s/!NEWLINE_IN_BOX!/<br>/g
              s/<\/div>/${NB_REPL_END}/
              s/\n( )*/\n/g
              b end
            }
            $ b end
            N
            b seek_box_end
        }
        p
        $ b end
        n
        b seek_box_start
      : end
        N
        p" ${wikiFile}


    # remove trailing spaces
    sed -ri 's/( )*$//' ${wikiFile}

    # fix broken nested tables
    sed -i 's/\([^^]\){|/\1\n{|/g' ${wikiFile}
    
    # replace [/node12.html#possiblestuff 14.5 Guide] with [[{{PAGENAME}}/Guide]] 
    sed -ri 's/(^[*]+ )\[\/node[0-9]*\.html[^ ]* ([0-9.]* )?(.*)\]/\1\[\[{{PAGENAME}}\/\3\|\3\]\]/g' ${wikiFile}
    
    # Trim page heads
    if ( test $( grep -Ec '^----[:blank:]*$' ${wikiFile} ) -eq 1 ) ; then
	sed -i 's/^----[:blank:]$//' ${wikiFile}
    elif ( test $( grep -Ec '^----[:blank:]*$' ${wikiFile} ) -gt 1 ) &&
	( ! echo ${wikiFile} | grep -q "index" ) &&
	( ! grep -Eq "=( )*Bibliography( )*=" ${wikiFile} ) ; then	
	sed -rni '
          : seek_marker
            /^----/ b found_marker
            $ b
            n
            b seek_marker

          : found_marker
            $ b
            n
            /^----/ q
            p
            b found_marker' ${wikiFile}
    fi
    
    # Fix for those broken wikilinks that point to the same page and are of
    # form [localhost#sec:flashing_kernel 19.3.1] (i.e. mandatory sec:xxx label 
    # in latex) and where a corresponding header =Flashing Kernel= can be found.
    # Will not work if the assumptions do not hold.
    while grep -q "\[localhost#sec:[^]]*[0-9\.]\+\]" $wikiFile ; do

        # first broken link: grep finds the link and 
        # tail removes possible multiple results on one row, leaving 
        # just the first
	local link=$( grep -m 1 -o "\[localhost#sec:[^]]*[0-9\.]\+\]" \
	    $wikiFile | tail -n 1 )

        # echo "local link found: $link"
        # section number in the link in regexp form, e.g., 19\.3\.1
	local secNumber=$( echo $link | grep -o "[0-9\.]\+\]" \
	    | sed -e 's/\]//' -e 's/\./\\\./g' )

        # name of the section, e.g., Flashing Kernel
	local secName=$( grep -m 1 -o "=\+ *${secNumber}.*=\+" ${wikiFile} | \
	    sed "s/=\+ *${secNumber}\.\? *\(.*[^ =]\) *=\+/\1/" )
	
        # echo "name: $secName, link: $link, number: $secNumber"
        # substitution: [localhost#sec:flashing_kernel 19.3.1] -> 
        # [[#Flashing Kernel]]
	if [[ -n $secName ]] ; then
	    sed -i \
		"s/\[localhost#sec:[^]]*${secNumber}\]/\[\[#${secName}\]\]/g" \
		$wikiFile
	else # if there is a problem, break to avoid an endless loop
	    break
	fi
    done
    
    # fix links to bibliographic references, e.g., [[/node23.html#maemosdks 65]]
    # This generates a local (per page) list of references as needed.
    local refsFlag=0
    while grep -q "\[\[\/node[0-9]\+\.html#[^]]*[0-9]\+\]\]" \
	$wikiFile ; do # find refs

	local ref=$( grep -m 1 -o "\[\[\/node[0-9]\+\.html#[^]]*[0-9]\+\]\]" \
	    ${wikiFile} | tail -n 1 )
	local refNum=$( echo ${ref} | grep -o "[0-9]\+\]\]" \
	    | sed -e 's/\]\]//' -e 's/\./\\\./g' )
	local refHtml=$( echo ${ref} \
	    | sed -e 's/\[\[\/\(node[0-9]\+\.html\).*/\1/' )
	
	if [[ ! -e $htmldir/$refHtml ]] ; then
	    echo "Bibliography file $htmldir/$refHtml not found! Aborting."
	    exit 1
	fi

        # the <DT>...<A NAME=...> entry for this reference, up to the next
        # empty line, as produced by latex2html bibliography pages
	local refInHtml=$( grep -A 10 "<DT><A NAME=[^>]*>$refNum<\/A>" \
	    $htmldir/$refHtml | grep -B 10 -m 1 "^$" )
	
	local refTitle=$( echo "$refInHtml" \
	    | grep -A 10 "<DD>" | grep -B 10 "<TT>" \
	    | sed 's/<BR>//g' \
	    | awk 'BEGIN { ORS=" " } ; { print $0 }' \
	    | sed -e 's/.*<DD>\(.*\)<TT>.*/\1/' )

        #local refTitle=$( echo "$refInHtml"|tail -n 3|head -n 1 )
	local refLink=$( echo "$refInHtml" \
	    | grep -o -m 1 'https\?:\/\/[^<>"]*' | head -n 1 ) 

       #echo -e "REFS: $ref, num=$refNum, refTitle=${refTitle}, link=$refLink\n"
	to_regex "$ref" && ref="$retval"
	to_regex "$refTitle" && refTitle="$retval"
	to_regex "$refLink" && refLink="$retval"
	
       #echo -e "REFS as regex: $ref, num=$refNum, refTitle=$refTitle, link=$refLink\n"
	
        # replace the first occurrence
	local lineNumber=$( grep -m 1 -n "${ref}" $wikiFile \
	    | awk 'BEGIN {RS=":"}; {print $1}'|head -n 1 )
	sed -i "${lineNumber} s/${ref}/<ref name=r${refNum}>${refTitle} ${refLink}<\/ref>/" $wikiFile
	
        # replace other occurrences (to avoid duplicate refs)
	sed -i "s/${ref}/<ref name=r${refNum}\/>/g" $wikiFile
	refsFlag=1
	
    done
    # (bare refsFlag works here: [[ ]] with -eq evaluates it arithmetically)
    if [[ refsFlag -eq 1 ]] ; then
	echo -e "== References ==\n<references />" >> $wikiFile
    fi
    
    # drop table references like [localhost#tab:foo 4.2], keeping the number
    sed -ri 's/[[]localhost#tab[^ ]* ([0-9.]+)[]]/\1/g' ${wikiFile}
  
    # remove section numbering
    sed -i 's/^\(=\+\) *[0-9\.]\+ *\(.*[^ ]\) *\1/\1 \2 \1/' ${wikiFile}
    
    # general title of the current page.
    local title=$( sed -n 's/^\(=\+\) *\(.*[^ ]\) *\1/\2/p' ${wikiFile} \
	| head -n 1 )
    

    # Fix image refs.
    # NOTE(review): the while loop runs in a pipeline subshell, so the
    # variables it sets do not survive past 'done'.
    egrep -o '\[localhost#fig:[^ ]+ [0-9\.]+\]' ${wikiFile} \
	| while read raw_label raw_num ; do
	
	original_ref="${raw_label} ${raw_num}"
	original_ref_sed=${original_ref/\[/\\\[}
	original_ref_sed=${original_ref_sed/\]/\\\]}
	original_ref_sed=${original_ref_sed/\./\\\.}
	
	ref_label=${raw_label/\[localhost#/}
	ref_num=${raw_num/\]/}
	
	new_ref_sed="\[\[#${ref_label}|${ref_num}\]\]"
	
        # anchor the figure: give the centering div after the figure
        # caption an id attribute matching the label
	sed -rni "
          /'''Figure ${ref_num}:'''/ {
            : found_figure
              $ b end
              N
              /<div align=\"CENTER\">/ {
                s/<div/<div id=\"${ref_label}\"/
                b end
              }
              b found_figure
          }
          : end
            p
            d" ${wikiFile}
	
	sed -i "s/${original_ref_sed}/${new_ref_sed}/" ${wikiFile}	
    done


    # split oversized wiki pages
    local currentPart=0
    get_file_size "$wikiFile"
    local wikiSize=$retval

    if [ ${wikiSize} -gt ${maxPageSize} ] ; then	
	splitfile=$( basename "${wikiFile}" )
	
        # compute number of parts that we are going to divide the file to
	local splitN=2
	while (( wikiSize/splitN > maxPageSize*2/3 )) ; do
	    ((splitN++))
	done  
	
        # compute the size of parts that we are aiming for
	local splitSizeTarget=$((wikiSize/splitN))
	
        # echo "split size data: $splitN $splitSizeTarget" 
        #     "${wikiSize} $maxPageSize"
	
        # Splitting: 
        # The algorithm goes through the file line by line and splits the file
        # when the current file size exceeds a certain limit:
        # If the point is good, i.e. a top-level header, split sooner.
        # 1) = a section header =, starting from 3/4 of target size
        # 2) === a lower header === (arbitrary level), starting from target size
        # 3) any line, starting from 9/10 maximum size

	local currentFName="${wikiFile}${currentPart}"
	local currentSize=0
        # NOTE(review): read without -r, so backslashes in lines are
        # interpreted; also re-stats the growing part file on every line,
        # which is slow for large files but simple.
	while read line ; do
        #0: not a header, >0: a header line
	    local isHeader=$( echo "${line}" \
		| sed -n 's/^=[^=]*= *$/&/p' | wc -m )
	    local isSubHeader=$( echo "${line}" \
		| sed -n 's/^\(=\+\)[^=]*\1 *$/&/p' | wc -m )
	    
            # check split point        
	    if ((currentPart==0 \
		|| currentSize>splitSizeTarget*3/4 && isHeader \
                || currentSize>splitSizeTarget && isSubHeader \
                || currentSize>maxPageSize*9/10)) ; then
		((currentPart++))
		currentFName="${wikiFile}${currentPart}"

		if ((currentPart>1)) ; then 
		    # add a special header
		    echo "= ${title} part ${currentPart} =" \
			> ${currentFName}
                    # first read line of part n
		    echo "${line}" >> ${currentFName} 
		else
                    # first line of part 1
		    echo "${line}" > ${currentFName}
		fi
	    else
                # nth line of part x
		echo "${line}" >> ${currentFName}
	    fi
            
	    get_file_size ${currentFName}
	    currentSize=${retval}
	done < ${wikiFile}
	
        # if the last part is small, append it to the second last part
	get_file_size ${currentFName}
	local lastSize=${retval}
	local file2Name=${wikiFile}$( expr ${currentPart} - 1 )
	get_file_size ${file2Name}
	local file2Size=${retval}
	if ((lastSize + file2Size < maxPageSize*5/6)) ; then
	    cat ${currentFName} >> ${file2Name}
	    rm ${currentFName}
	    ((currentPart--))
	fi
	
        # the unsplit original is replaced by the numbered parts
	rm ${wikiFile}
    fi
    
    # print file name and title, and part names for a split file
    echo "${rWikiFile}: ${title}"
    for ((i=1; i<=currentPart; i++)) ; do
	echo "  ${rWikiFile}${i}"
    done
}


# ---- "main" part of the script --------------------------------

# String constants used as boolean flag values; marked readonly since
# they are never reassigned.
readonly YES="yes"
readonly NO="no"

# --use-template-note switches notebox rendering from a hand-built wiki
# table to the {{note}} template (see wikify_file); NOT readonly because
# the option parser reassigns it.
USE_TEMPLATE_NOTE=${NO}

# handles command line options; each recognized option consumes itself
# (and its value, if any) via shift.  Unknown options print the help text
# and abort.
if [ $# -ne 0 ] ; then
    while [ $# -gt 0 ] ; do
	case $1 in
	    --in)
		if [[ $# -lt 2 || ! -e $2 ]]; then
		    echo "Please give an input dir or file."
		    exit 1
		else
		    inpath=$2
		fi
		shift 2
		;;

	    --out)
		if [[ $# -lt 2 ]]; then
		    echo "Please give an output dir."
		    exit 1
		fi
		outpath=$2
		shift 2
		;;

	    --pagesize)
		# require a fully numeric value: the old pattern test
		# ([0-9]*) only checked the first character, letting values
		# like "12abc" through to break arithmetic in wikify_file
		if [[ $# -lt 2 || ! $2 =~ ^[0-9]+$ ]]; then
		    echo "Please give a numeric max. page size."
		    exit 1
		fi
		maxPageSize=$2
		shift 2
		;;

	    --help)
		help_func
		exit 0
		;;

	    --use-template-note)
		USE_TEMPLATE_NOTE=${YES}
		shift
		;;

	    *)
		echo "Unknown option: $1"
		help_func
		exit 1
		;;
	esac
    done
else
    # no arguments at all: show usage and fail
    help_func
    exit 1
fi

# Both --in and --out are mandatory; bail out with a hint if either is
# still empty.  (set +u so the tests tolerate unset values.)
set +u
if [ -z "${inpath}" ] ; then
    echo "No input path set, please use --in."
    exit 1
elif [ -z "${outpath}" ] ; then
    echo "No output path set, please use --out."
    exit 1
fi

# start the action!

# convert a single file
if [ -f "${inpath}" ] ; then
    wikify_file "${inpath}" "${outpath}" "${maxPageSize}"

# convert a folder of files
elif [ -d "${inpath}" ] ; then

    if [ -f "${outpath}" ] ; then
	echo "Can not write to ${outpath}."
	exit 1
    fi

    [ -d "${outpath}" ] || mkdir -p "${outpath}"

    html_files=$( find "$inpath" -maxdepth 1 -name "*.html" )

    if [ -z "${html_files}" ] ; then
	echo "No html files found!"
	exit 1
    fi

    # NB: word-splitting of ${html_files} is intentional; file names
    # containing whitespace are not supported by this loop.
    for file in ${html_files} ; do
	wikify_file \
	    "${file}" \
	    "${outpath}/$( basename "${file%.html}" ).wiki" \
	    "$maxPageSize"
    done

    # copy images; '|| true' keeps set -e from aborting the script before
    # the completion message when no png/jpg files exist
    cp -f "${inpath}"/*.png "${inpath}"/*.jpg "${outpath}" 2>/dev/null || true

else
    echo "Please give a valid file or folder with --in <path | html file>"
    exit 1
fi

cat <<EOF
Wikification complete.

NOTE: The wiki files created by this script are not perfect and should be 
      manually checked.

EOF
exit 0

