msysgit/bin/docx2txt

#!/usr/bin/env perl

# docx2txt, a command-line utility to convert Docx documents to text format.
# Copyright (C) 2008-2009 Sandeep Kumar
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#
# This script extracts text from document.xml contained inside .docx file.
# Perl v5.8.2 was used for testing this script.
#
# Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
#
# ChangeLog :
#
#    10/08/2008 - Initial version (v0.1)
#    15/08/2008 - Script takes two arguments [second optional] now and can be
#                 used independently to extract text from docx file. It accepts
#                 docx file directly, instead of xml file.
#    18/08/2008 - Added support for center and right justification of text that
#                 fits in a line 80 characters wide (adjustable).
#    03/09/2008 - Fixed the slip in usage message.
#    12/09/2008 - Slightly changed the script invocation and argument handling
#                 to incorporate some of the shell script functionality here.
#                 Added support to handle embedded urls in docx document.
#    23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
#                 Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
#                 during installation.
#    31/08/2009 - Added support for handling more escape characters.
#                 Using OS specific null device to redirect stderr.
#                 Saving text file in binary mode.
#    03/09/2009 - Updations based on feedback/suggestions from Sergei Kulakov
#                 (sergei>AT<dewia>DOT<com).
#                 - removal of non-document text in between TOC related tags.
#                 - display of hyperlink alongside linked text user controlled.
#                 - some character conversion updates
#    05/09/2009 - Merged cjustify and rjustify into single subroutine justify.
#                 Added more character conversions.
#                 Organised conversion mappings in tabular form for speedup and
#                 easy maintenance.
#                 Tweaked code to reduce number of passes over document content.
#    10/09/2009 - For leaner text experience, hyperlink is not displayed if
#                 hyperlink and hyperlinked text are same, even if user has
#                 enabled hyperlink display.
#                 Improved handling of short line justification. Many
#                 justification tag patterns were not captured earlier.
#    11/09/2009 - A directory holding the unzipped content of .docx file can
#                 also be specified as argument to the script, in place of file.
#    17/09/2009 - Removed trailing slashes from input directory name.
#                 Updated unzip command invocations to handle path names
#                 containing spaces.
#    01/10/2009 - Added support for configuration file.
#    02/10/2009 - Using single quotes to specify path for unzip command. 
#    04/10/2009 - Corrected configuration option name lineIndent to listIndent.
#


#
# The default settings below can be overridden via docx2txt.config - searched
# first in current directory and then in the same location as this script.
#

our $unzip = '/usr/bin/unzip';	# Windows path like 'C:/path/to/unzip.exe'
our $newLine = "\n";		# Alternative is "\r\n".
our $listIndent = "  ";		# Indent nested lists by "\t", " " etc.
our $lineWidth = 80;		# Line width, used for short line justification.
our $showHyperLink = "N";	# Show hyperlink alongside linked text.


# ToDo: Better list handling. Currently assumed 8 level nesting.
my @levchar = ('*', '+', 'o', '-', '**', '++', 'oo', '--');

#
# Character conversion tables
#

# Only amp, gt and lt are required for docx escapes, others are used for better
# text experience.
my %escChrs = (	amp => '&', gt => '>', lt => '<',
		acute => '\'', brvbar => '|', copy => '(C)', divide => '/',
		laquo => '<<', macr => '-', nbsp => ' ', raquo => '>>',
		reg => '(R)', shy => '-', times => 'x'
);

my %splchars = (
	"\xC2\xA0" => ' ',		# <nbsp>
	"\xC2\xA6" => '|',		# <brokenbar>
	"\xC2\xA9" => '(C)',		# <copyright>
	"\xC2\xAB" => '<<',		# <laquo>
	"\xC2\xAC" => '-',		# <negate>
	"\xC2\xAE" => '(R)',		# <regd>
	"\xC2\xB1" => '+-',		# <plusminus>
	"\xC2\xBB" => '>>',		# <raquo>

#	"\xC2\xA7" => '',		# <section>
#	"\xC2\xB6" => '',		# <para>

	"\xC3\x97" => 'x',		# <mul>
	"\xC3\xB7" => '/',		# <div>

	"\xE2\x80\x82" => '  ',		# <enspc>
	"\xE2\x80\x83" => '  ',		# <emspc>
	"\xE2\x80\x85" => ' ',		# <qemsp>
	"\xE2\x80\x93" => ' - ',	# <endash>
	"\xE2\x80\x94" => ' -- ',	# <emdash>
	"\xE2\x80\x98" => '`',		# <soq>
	"\xE2\x80\x99" => '\'',		# <scq>
	"\xE2\x80\x9C" => '"',		# <doq>
	"\xE2\x80\x9D" => '"',		# <dcq>
	"\xE2\x80\xA2" => '::',		# <diamond symbol>
	"\xE2\x80\xA6" => '...',	# <ellipsis>

	"\xE2\x84\xA2" => '(TM)',	# <trademark>

	"\xE2\x89\xA0" => '!=',		# <neq>
	"\xE2\x89\xA4" => '<=',		# <leq>
	"\xE2\x89\xA5" => '>=',		# <geq>

	#
	# Currency symbols
	#
	"\xC2\xA2" => 'cent',
	"\xC2\xA3" => 'Pound',
	"\xC2\xA5" => 'Yen',
	"\xE2\x82\xAC" => 'Euro'
);


#
# Check argument(s) sanity.
#

my $usage = <<USAGE;

Usage:	$0 <infile.docx> [outfile.txt|-]

	Use '-' as the outfile name to dump the text on STDOUT.
	Output is saved in infile.txt if second argument is omitted.

	infile.docx can also be a directory name holding the unzipped content
	of concerned .docx file.

USAGE

die $usage if (@ARGV == 0 || @ARGV > 2);


#
# Check for existence and readability of required file in specified directory,
# and whether it is a text file.
#

sub check_for_required_file_in_folder {
    stat("$_[1]/$_[0]");
    die "Can't read <$_[0]> in <$_[1]>!\n" if ! (-f _ && -r _);
    die "<$_[1]/$_[0]> does not seem to be a text file!\n" if ! -T _;
}

sub readFileInto
{
  local $/ = undef;
  open my $fh, "$_[0]" or die "Couldn't read file <$_[0]>!\n";
  binmode $fh;
  $_[1] = <$fh>;
  close $fh;
}


#
# Check whether first argument is specifying a directory holding extracted
# content of .docx file, or .docx file itself.
#

stat($ARGV[0]);

if (-d _) {
    check_for_required_file_in_folder("word/document.xml", $ARGV[0]);
    check_for_required_file_in_folder("word/_rels/document.xml.rels", $ARGV[0]);
    $inpIsDir = 'y';
}
else {
    die "Can't read docx file <$ARGV[0]>!\n" if ! (-f _ && -r _);
    die "<$ARGV[0]> does not seem to be docx file!\n" if -T _;
}


#
# Get user configuration, if any.
#

my %config;

if (-f "docx2txt.config") {
    %config = do 'docx2txt.config';
} elsif ($0 =~ m%^(.*[/\\])[^/\\]*?$%) {
    %config = do "$1docx2txt.config" if (-f "$1docx2txt.config");
}

if (%config) {
    foreach my $var (keys %config) {
        $$var = $config{$var};
    }
}


#
# Extract xml document content from argument docx file/directory.
#

if ($ENV{OS} =~ /^Windows/) {
    $nulldevice = "nul";
} else {
    $nulldevice = "/dev/null";
}

if ($inpIsDir eq 'y') {
    readFileInto("$ARGV[0]/word/document.xml", $content);
} else {
    $content = `"$unzip" -p "$ARGV[0]" word/document.xml 2>$nulldevice`;
}

die "Failed to extract required information from <$ARGV[0]>!\n" if ! $content;


#
# Be ready for outputting the extracted text contents.
#

if (@ARGV == 1) {
     $ARGV[1] = $ARGV[0];

     # Remove any trailing slashes to generate proper output filename, when
     # input is directory.
     $ARGV[1] =~ s%[/\\]+$%% if ($inpIsDir eq 'y');

     $ARGV[1] .= ".txt" if !($ARGV[1] =~ s/\.docx$/\.txt/);
}

my $txtfile;
open($txtfile, "> $ARGV[1]") || die "Can't create <$ARGV[1]> for output!\n";
binmode $txtfile;    # Ensure no auto-conversion of '\n' to '\r\n' on Windows.


#
# Gather information about header, footer, hyperlinks, images, footnotes etc.
#

if ($inpIsDir eq 'y') {
    readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);
} else {
    $_ = `"$unzip" -p "$ARGV[0]" word/_rels/document.xml.rels 2>$nulldevice`;
}

my %docurels;
while (/<Relationship Id="(.*?)" Type=".*?\/([^\/]*?)" Target="(.*?)"( .*?)?\/>/g)
{
    $docurels{"$2:$1"} = $3;
}


#
# Subroutines for center and right justification of text in a line.
#

sub justify {
    my $len = length $_[1];

    if ($_[0] eq "center" && $len < ($lineWidth - 1)) {
        return ' ' x (($lineWidth - $len) / 2) . $_[1];
    } elsif ($_[0] eq "right" && $len < $lineWidth) {
        return ' ' x ($lineWidth - $len) . $_[1];
    } else {
        return $_[1];
    }
}

#
# Subroutines for dealing with embedded links and images
#

sub hyperlink {
    my $hlrid = $_[0];
    my $hltext = $_[1];
    my $hlink = $docurels{"hyperlink:$hlrid"};

    $hltext =~ s/<[^>]*?>//og;
    $hltext .= " [HYPERLINK: $hlink]" if ($showHyperLink eq "y" && $hltext ne $hlink);

    return $hltext;
}

#
# Subroutines for processing paragraph content.
#

sub processParagraph {
    my $para = $_[0] . "$newLine";
    my $align = $1 if ($_[0] =~ /<w:jc w:val="([^"]*?)"\/>/);

    $para =~ s/<.*?>//og;
    return justify($align,$para) if $align;

    return $para;
}


#
# Force configuration value to lowercase as expected by script.
#
$showHyperLink = lc $showHyperLink;


#
# Text extraction starts.
#

my %tag2chr = (tab => "\t", noBreakHyphen => "-", softHyphen => " - ");

$content =~ s/<?xml .*?\?>(\r)?\n//;

# Remove stuff between TOC related tags.
if ($content =~ m|<w:pStyle w:val="TOCHeading"/>|) {
    $content =~ s|<w:instrText[^>]*>.*?</w:instrText>||og;
}

$content =~ s{<w:(tab|noBreakHyphen|softHyphen)/>}|$tag2chr{$1}|og;

my $hr = '-' x $lineWidth . $newLine;
$content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|og;

$content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$listIndent x $1 . "$levchar[$1] "|oge;

#
# Uncomment either of below two lines and comment above line, if dealing
# with more than 8 level nested lists.
#

# $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$listIndent x $1 . '* '|oge;
# $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|'*' x ($1+1) . ' '|oge;

$content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/oge;

$content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperlink($1,$2)/oge;

$content =~ s/<w:p [^>]+?>(.*?)<\/w:p>/processParagraph($1)/oge;

$content =~ s{<w:p [^/>]+?/>|</w:p>|<w:br/>}|$newLine|og;
$content =~ s/<.*?>//og;


#
# Convert non-ASCII characters/character sequences to ASCII characters.
#

$content =~ s/(\xE2..|\xC2.|\xC3.)/($splchars{$1} ? $splchars{$1} : $1)/oge;

#
# Convert docx specific escape chars first.
#
$content =~ s/(&)(amp|gt|lt)(;)/$escChrs{lc $2}/iog;

#
# Another pass for a better text experience, after sequences like "&amp;laquo;"
# are converted to "&laquo;".
#
$content =~ s/((&)([a-z]+)(;))/($escChrs{lc $3} ? $escChrs{lc $3} : $1)/ioge;


#
# Write the extracted and converted text contents to output.
#

print $txtfile $content;
close $txtfile;
Add a textconv filter for docx files Signed-off-by: Albert Krawczyk <pro-logic@optusnet.com.au> Improved-by: Sebastian Schuberth <sschuberth@gmail.com> 2011-12-05 12:19:19 +04:00			`#!/usr/bin/env perl`

			`# docx2txt, a command-line utility to convert Docx documents to text format.`
			`# Copyright (C) 2008-2009 Sandeep Kumar`
			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation; either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program; if not, write to the Free Software`
			`# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`

			`#`
			`# This script extracts text from document.xml contained inside .docx file.`
			`# Perl v5.8.2 was used for testing this script.`
			`#`
			`# Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)`
			`#`
			`# ChangeLog :`
			`#`
			`# 10/08/2008 - Initial version (v0.1)`
			`# 15/08/2008 - Script takes two arguments [second optional] now and can be`
			`# used independently to extract text from docx file. It accepts`
			`# docx file directly, instead of xml file.`
			`# 18/08/2008 - Added support for center and right justification of text that`
			`# fits in a line 80 characters wide (adjustable).`
			`# 03/09/2008 - Fixed the slip in usage message.`
			`# 12/09/2008 - Slightly changed the script invocation and argument handling`
			`# to incorporate some of the shell script functionality here.`
			`# Added support to handle embedded urls in docx document.`
			`# 23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from`
			`# Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work`
			`# during installation.`
			`# 31/08/2009 - Added support for handling more escape characters.`
			`# Using OS specific null device to redirect stderr.`
			`# Saving text file in binary mode.`
			`# 03/09/2009 - Updations based on feedback/suggestions from Sergei Kulakov`
			`# (sergei>AT<dewia>DOT<com).`
			`# - removal of non-document text in between TOC related tags.`
			`# - display of hyperlink alongside linked text user controlled.`
			`# - some character conversion updates`
			`# 05/09/2009 - Merged cjustify and rjustify into single subroutine justify.`
			`# Added more character conversions.`
			`# Organised conversion mappings in tabular form for speedup and`
			`# easy maintenance.`
			`# Tweaked code to reduce number of passes over document content.`
			`# 10/09/2009 - For leaner text experience, hyperlink is not displayed if`
			`# hyperlink and hyperlinked text are same, even if user has`
			`# enabled hyperlink display.`
			`# Improved handling of short line justification. Many`
			`# justification tag patterns were not captured earlier.`
			`# 11/09/2009 - A directory holding the unzipped content of .docx file can`
			`# also be specified as argument to the script, in place of file.`
			`# 17/09/2009 - Removed trailing slashes from input directory name.`
			`# Updated unzip command invocations to handle path names`
			`# containing spaces.`
			`# 01/10/2009 - Added support for configuration file.`
			`# 02/10/2009 - Using single quotes to specify path for unzip command.`
			`# 04/10/2009 - Corrected configuration option name lineIndent to listIndent.`
			`#`


			`#`
			`# The default settings below can be overridden via docx2txt.config - searched`
			`# first in current directory and then in the same location as this script.`
			`#`

			`our $unzip = '/usr/bin/unzip'; # Windows path like 'C:/path/to/unzip.exe'`
			`our $newLine = "\n"; # Alternative is "\r\n".`
			`our $listIndent = " "; # Indent nested lists by "\t", " " etc.`
			`our $lineWidth = 80; # Line width, used for short line justification.`
			`our $showHyperLink = "N"; # Show hyperlink alongside linked text.`


			`# ToDo: Better list handling. Currently assumed 8 level nesting.`
			`my @levchar = ('', '+', 'o', '-', '*', '++', 'oo', '--');`

			`#`
			`# Character conversion tables`
			`#`

			`# Only amp, gt and lt are required for docx escapes, others are used for better`
			`# text experience.`
			`my %escChrs = ( amp => '&', gt => '>', lt => '<',`
			`acute => '\'', brvbar => '\|', copy => '(C)', divide => '/',`
			`laquo => '<<', macr => '-', nbsp => ' ', raquo => '>>',`
			`reg => '(R)', shy => '-', times => 'x'`
			`);`

			`my %splchars = (`
			`"\xC2\xA0" => ' ', # <nbsp>`
			`"\xC2\xA6" => '\|', # <brokenbar>`
			`"\xC2\xA9" => '(C)', # <copyright>`
			`"\xC2\xAB" => '<<', # <laquo>`
			`"\xC2\xAC" => '-', # <negate>`
			`"\xC2\xAE" => '(R)', # <regd>`
			`"\xC2\xB1" => '+-', # <plusminus>`
			`"\xC2\xBB" => '>>', # <raquo>`

			`# "\xC2\xA7" => '', # <section>`
			`# "\xC2\xB6" => '', # <para>`

			`"\xC3\x97" => 'x', # <mul>`
			`"\xC3\xB7" => '/', # <div>`

			`"\xE2\x80\x82" => ' ', # <enspc>`
			`"\xE2\x80\x83" => ' ', # <emspc>`
			`"\xE2\x80\x85" => ' ', # <qemsp>`
			`"\xE2\x80\x93" => ' - ', # <endash>`
			`"\xE2\x80\x94" => ' -- ', # <emdash>`
			"\xE2\x80\x98" => '`', # <soq>
			`"\xE2\x80\x99" => '\'', # <scq>`
			`"\xE2\x80\x9C" => '"', # <doq>`
			`"\xE2\x80\x9D" => '"', # <dcq>`
			`"\xE2\x80\xA2" => '::', # <diamond symbol>`
			`"\xE2\x80\xA6" => '...', # <ellipsis>`

			`"\xE2\x84\xA2" => '(TM)', # <trademark>`

			`"\xE2\x89\xA0" => '!=', # <neq>`
			`"\xE2\x89\xA4" => '<=', # <leq>`
			`"\xE2\x89\xA5" => '>=', # <geq>`

			`#`
			`# Currency symbols`
			`#`
			`"\xC2\xA2" => 'cent',`
			`"\xC2\xA3" => 'Pound',`
			`"\xC2\xA5" => 'Yen',`
			`"\xE2\x82\xAC" => 'Euro'`
			`);`


			`#`
			`# Check argument(s) sanity.`
			`#`

			`my $usage = <<USAGE;`

			`Usage: $0 <infile.docx> [outfile.txt\|-]`

			`Use '-' as the outfile name to dump the text on STDOUT.`
			`Output is saved in infile.txt if second argument is omitted.`

			`infile.docx can also be a directory name holding the unzipped content`
			`of concerned .docx file.`

			`USAGE`

			`die $usage if (@ARGV == 0 \|\| @ARGV > 2);`


			`#`
			`# Check for existence and readability of required file in specified directory,`
			`# and whether it is a text file.`
			`#`

			`sub check_for_required_file_in_folder {`
			`stat("$_[1]/$_[0]");`
			`die "Can't read <$_[0]> in <$_[1]>!\n" if ! (-f _ && -r _);`
			`die "<$_[1]/$_[0]> does not seem to be a text file!\n" if ! -T _;`
			`}`

			`sub readFileInto`
			`{`
			`local $/ = undef;`
			`open my $fh, "$_[0]" or die "Couldn't read file <$_[0]>!\n";`
			`binmode $fh;`
			`$_[1] = <$fh>;`
			`close $fh;`
			`}`


			`#`
			`# Check whether first argument is specifying a directory holding extracted`
			`# content of .docx file, or .docx file itself.`
			`#`

			`stat($ARGV[0]);`

			`if (-d _) {`
			`check_for_required_file_in_folder("word/document.xml", $ARGV[0]);`
			`check_for_required_file_in_folder("word/_rels/document.xml.rels", $ARGV[0]);`
			`$inpIsDir = 'y';`
			`}`
			`else {`
			`die "Can't read docx file <$ARGV[0]>!\n" if ! (-f _ && -r _);`
			`die "<$ARGV[0]> does not seem to be docx file!\n" if -T _;`
			`}`


			`#`
			`# Get user configuration, if any.`
			`#`

			`my %config;`

			`if (-f "docx2txt.config") {`
			`%config = do 'docx2txt.config';`
			`} elsif ($0 =~ m%^(.[/\\])[^/\\]?$%) {`
			`%config = do "$1docx2txt.config" if (-f "$1docx2txt.config");`
			`}`

			`if (%config) {`
			`foreach my $var (keys %config) {`
			`$$var = $config{$var};`
			`}`
			`}`


			`#`
			`# Extract xml document content from argument docx file/directory.`
			`#`

			`if ($ENV{OS} =~ /^Windows/) {`
			`$nulldevice = "nul";`
			`} else {`
			`$nulldevice = "/dev/null";`
			`}`

			`if ($inpIsDir eq 'y') {`
			`readFileInto("$ARGV[0]/word/document.xml", $content);`
			`} else {`
			$content = `"$unzip" -p "$ARGV[0]" word/document.xml 2>$nulldevice`;
			`}`

			`die "Failed to extract required information from <$ARGV[0]>!\n" if ! $content;`


			`#`
			`# Be ready for outputting the extracted text contents.`
			`#`

			`if (@ARGV == 1) {`
			`$ARGV[1] = $ARGV[0];`

			`# Remove any trailing slashes to generate proper output filename, when`
			`# input is directory.`
			`$ARGV[1] =~ s%[/\\]+$%% if ($inpIsDir eq 'y');`

			`$ARGV[1] .= ".txt" if !($ARGV[1] =~ s/\.docx$/\.txt/);`
			`}`

			`my $txtfile;`
			`open($txtfile, "> $ARGV[1]") \|\| die "Can't create <$ARGV[1]> for output!\n";`
			`binmode $txtfile; # Ensure no auto-conversion of '\n' to '\r\n' on Windows.`


			`#`
			`# Gather information about header, footer, hyperlinks, images, footnotes etc.`
			`#`

			`if ($inpIsDir eq 'y') {`
			`readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);`
			`} else {`
			$_ = `"$unzip" -p "$ARGV[0]" word/_rels/document.xml.rels 2>$nulldevice`;
			`}`

			`my %docurels;`
			`while (/<Relationship Id="(.?)" Type=".?\/([^\/]?)" Target="(.?)"( .*?)?\/>/g)`
			`{`
			`$docurels{"$2:$1"} = $3;`
			`}`


			`#`
			`# Subroutines for center and right justification of text in a line.`
			`#`

			`sub justify {`
			`my $len = length $_[1];`

			`if ($_[0] eq "center" && $len < ($lineWidth - 1)) {`
			`return ' ' x (($lineWidth - $len) / 2) . $_[1];`
			`} elsif ($_[0] eq "right" && $len < $lineWidth) {`
			`return ' ' x ($lineWidth - $len) . $_[1];`
			`} else {`
			`return $_[1];`
			`}`
			`}`

			`#`
			`# Subroutines for dealing with embedded links and images`
			`#`

			`sub hyperlink {`
			`my $hlrid = $_[0];`
			`my $hltext = $_[1];`
			`my $hlink = $docurels{"hyperlink:$hlrid"};`

			`$hltext =~ s/<[^>]*?>//og;`
			`$hltext .= " [HYPERLINK: $hlink]" if ($showHyperLink eq "y" && $hltext ne $hlink);`

			`return $hltext;`
			`}`

			`#`
			`# Subroutines for processing paragraph content.`
			`#`

			`sub processParagraph {`
			`my $para = $_[0] . "$newLine";`
			`my $align = $1 if ($_[0] =~ /<w:jc w:val="([^"]*?)"\/>/);`

			`$para =~ s/<.*?>//og;`
			`return justify($align,$para) if $align;`

			`return $para;`
			`}`


			`#`
			`# Force configuration value to lowercase as expected by script.`
			`#`
			`$showHyperLink = lc $showHyperLink;`


			`#`
			`# Text extraction starts.`
			`#`

			`my %tag2chr = (tab => "\t", noBreakHyphen => "-", softHyphen => " - ");`

			`$content =~ s/<?xml .*?\?>(\r)?\n//;`

			`# Remove stuff between TOC related tags.`
			`if ($content =~ m\|<w:pStyle w:val="TOCHeading"/>\|) {`
			`$content =~ s\|<w:instrText[^>]>.?</w:instrText>\|\|og;`
			`}`

			`$content =~ s{<w:(tab\|noBreakHyphen\|softHyphen)/>}\|$tag2chr{$1}\|og;`

			`my $hr = '-' x $lineWidth . $newLine;`
			`$content =~ s\|<w:pBdr>.*?</w:pBdr>\|$hr\|og;`

			`$content =~ s\|<w:numPr><w:ilvl w:val="([0-9]+)"/>\|$listIndent x $1 . "$levchar[$1] "\|oge;`

			`#`
			`# Uncomment either of below two lines and comment above line, if dealing`
			`# with more than 8 level nested lists.`
			`#`

			`# $content =~ s\|<w:numPr><w:ilvl w:val="([0-9]+)"/>\|$listIndent x $1 . '* '\|oge;`
			`# $content =~ s\|<w:numPr><w:ilvl w:val="([0-9]+)"/>\|'*' x ($1+1) . ' '\|oge;`

			`$content =~ s{<w:caps/>.?(<w:t>\|<w:t [^>]+>)(.?)</w:t>}/uc $2/oge;`

			`$content =~ s{<w:hyperlink r:id="(.?)".?>(.*?)</w:hyperlink>}/hyperlink($1,$2)/oge;`

			`$content =~ s/<w:p [^>]+?>(.*?)<\/w:p>/processParagraph($1)/oge;`

			`$content =~ s{<w:p [^/>]+?/>\|</w:p>\|<w:br/>}\|$newLine\|og;`
			`$content =~ s/<.*?>//og;`


			`#`
			`# Convert non-ASCII characters/character sequences to ASCII characters.`
			`#`

			`$content =~ s/(\xE2..\|\xC2.\|\xC3.)/($splchars{$1} ? $splchars{$1} : $1)/oge;`

			`#`
			`# Convert docx specific escape chars first.`
			`#`
			`$content =~ s/(&)(amp\|gt\|lt)(;)/$escChrs{lc $2}/iog;`

			`#`
			`# Another pass for a better text experience, after sequences like "&laquo;"`
			`# are converted to "«".`
			`#`
			`$content =~ s/((&)([a-z]+)(;))/($escChrs{lc $3} ? $escChrs{lc $3} : $1)/ioge;`


			`#`
			`# Write the extracted and converted text contents to output.`
			`#`

			`print $txtfile $content;`
			`close $txtfile;`