# qclip - enqueue the clipboard's text into the next file in a clips directory
#
# Copyright (C) 2005 Brian Holtz  brian@holtz.org
#
# This software is redistributable under the terms of the
# Creative Commons Attribution-ShareAlike license 2.0
# at http://creativecommons.org/licenses/by-sa/2.0/
#
# I use qclip to load my PocketPC with news articles and web pages that
# Fonix iSpeak for PocketPC reads to me. First I load the clipboard via
#   * Right-click article's URL, choose "Copy Shortcut", or
#   * Left-click article's URL, then Ctrl-A, then Ctrl-C
# Then I click an icon that runs qclip, and zap -- the body of the article
# is now a text file in a folder automatically sync'd to my PocketPC,
# where Fonix iSpeak has a playlist that includes all such files.
#
# Usage: qclip dir
# 1) qclip sorts the text files in dir by name and renames them like
#    00.txt 01.txt 02.txt
# 2) qclip retrieves the clipboard text. If the text is a URL, qclip
#    fetches the URL and uses that text.
# 3) qclip attempts to strip any of the headers and footers it knows about.
#    (Add extra headers and footers to the arrays immediately below.)
# 4) qclip writes the text to a new file in the dir, e.g. 03.txt
#
# XXX The popups of some sites (mercurynews.com) foil the URL fetching feature?
# XXX For multipage articles, it would be cool to scan for a well-known
#     "print this page" link and substitute the text of the linked-to page.

use warnings; use strict;

# If this substring appears in the first half of the text, delete up to its end
my @headers2Strip = (
	"--------------------------",
	"Wired News Wire service news & photos Wired Magazine HotBot (the Web)",
	" El Cato",			# cato.org
	"Donate Today!",		# socialsecurity.org
	" ť", "| Main",			# typepad
	"Posted on",			# mercurynews.com
	"Email page",			#
	"Syndicate: xml or rdf",	#
	"Reprints",			# nytimes.com
	"Font Size:",			# techcentralstation.com
	"Site Feed",			# scrivener.net
	"HotBot (the Web)",		# wired.com
);

# If this substring appears in the last half of the text, delete it and the rest
my @footers2Strip = (
	"--------------------------",
	"___ ",				# Yahoo news
	"Email Story",			# Yahoo news
	"Printer Friendly Version",	#
	"Printer friendly",		#
	"Post a comment",		# movable type
	"Next Article in",		# nytimes.com
	"Ads by Google",		# latimes.com
	"Copyright 2005 The New York Times Company",
	"Reason Public Policy Institute is a public policy think tank",
	"Have a comment on this article?",
	"Sign up for Sacramento Bee newsletters",
);

# Preview the die string for 10 seconds before the Windows terminal goes away
$SIG{'__DIE__'} =
	sub {
		print( "$_[0]\n" ); sleep 10; die $_[0];
	};

use Win32::Clipboard;

sub RenumberTextFilesIn( $ ) {
	my $dir = shift(@_);
	opendir(DIR, $dir) || die "cannot opendir $dir: $!";
	my @files = readdir(DIR);
	@files = sort( @files );
	my $basename = "00";
	foreach my $file ( @files ) {
		my $path = "$dir/$file";
		if ( -d $path ) { next; }
		if ( substr( $file, length( $file ) - 4 ) ne ".txt" ) { next; }
		rename( "$dir/$file", "$dir/$basename.txt" )
			|| die "cannot rename $dir/$file to $dir/$basename.txt: $!";
		$basename = ++($basename);
	}
	closedir DIR;
}

sub NextTextFileIn( $ ) {
	my $dir = shift(@_);
	opendir(DIR, $dir) || die "cannot opendir $dir: $!";
	my @files = readdir(DIR);
	my @textfiles;
	foreach my $file ( @files ) {
		my $path = "$dir/$file";
		if ( -d $path ) { next; }
		if ( substr( $file, length( $file ) - 4 ) ne ".txt" ) { next; }
		@textfiles = (@textfiles, $file);
	}
	@textfiles = sort( @textfiles );
	my $basename = $textfiles[ $#textfiles ];
	$basename =~ s/\.txt$//;
	return "$dir/" . ++($basename) . ".txt";
	closedir DIR;
}

sub TrimText( $ ) {
	my $text = shift(@_);
	foreach my $headerKey ( @headers2Strip ) {
		my $headerPos = rindex( $text, $headerKey, length( $text ) / 2 );
		if ( $headerPos < 0 ) { next; }
		#print "beheading at " . $headerPos . ": " . $headerKey . "\n";
		$text = substr( $text, $headerPos + length( $headerKey ));
	}
	foreach my $footerKey ( @footers2Strip ) {
		my $footerPos = index( $text, $footerKey, length( $text ) / 2 );
		if ( $footerPos < 0 ) { next; }
		#print "defooting at " . $footerPos . ": " . $footerKey . "\n";
		$text = substr( $text, 0, $footerPos );
	}
	# Clean up excess whitespace
	$text =~ s/\r//sg;		# carriage returns
	$text =~ s/^[ \n]*//sg;		# leading whitespace
	$text =~ s/\n\n\n+/\n\n/sg;	# multiple blank lines
	$text =~ s/[ \n]+$//sg;		# trailing whitespace
	return $text;
}

# cf. http://pleac.sourceforge.net/pleac_perl/webautomation.html

use LWP::Simple;
use HTML::TreeBuilder;
use HTML::FormatText;

sub GetPrintableText( $ ) {
	my $url = shift(@_);
	my $html = LWP::Simple::get( $url );
	if (! defined( $html ) || length( $html ) <= 0) {
		die "could not get $url";
	}
	my $tree = HTML::TreeBuilder->new();
	$tree->parse( $html );
	$tree->eof();
	my $formatter = HTML::FormatText->new( leftmargin => 0 );
	my $ascii = $formatter->format( $tree );
	return $ascii;
}

my $clipdir=$ARGV[0];
RenumberTextFilesIn( $clipdir );
my $clipfile = NextTextFileIn( $clipdir );

my $clipBoard = Win32::Clipboard();
my $clipText = $clipBoard->Get();
if ( substr( $clipText, 0, 4 ) eq "http" && length( $clipText ) < 1000 ) {
	# GetTitleBytes( $clipText ); exit 0;
	$clipText = GetPrintableText( $clipText );
}
$clipText = TrimText( $clipText );

#print $clipText; exit 0;

open(OUTFILE, '>', $clipfile);
print( OUTFILE $clipText );
close( OUTFILE );

# Preview the top and bottom of the trimmed text for 5 seconds
$| = 1; # autoflush
my $top = $clipText;
$top = substr( $clipText, 0, 300 );
$top = substr( $top, 0, rindex( $top, "\n" )-1 );
print $top;
print "\n...\n";
my $bot = $clipText;
$bot = substr( $clipText, length( $clipText ) - 300 );
$bot = substr( $bot, index( $bot, "\n" )+1 );
print $bot;
sleep 5;