# qclip - enqueue the clipboard's text into the next file in a clips directory
#
# Copyright (C) 2005 Brian Holtz brian@holtz.org
#
# This software is redistributable under the terms of the
# Creative Commons Attribution-ShareAlike license 2.0
# at http://creativecommons.org/licenses/by-sa/2.0/
#
# I use qclip to load my PocketPC with news articles and web pages that
# Fonix iSpeak for PocketPC reads to me. First I load the clipboard via
# * Right-click article's URL, choose "Copy Shortcut", or
# * Left-click article's URL, then Ctrl-A, then Ctrl-C
# Then I click an icon that runs qclip, and zap -- the body of the article
# is now a text file in a folder automatically sync'd to my PocketPC,
# where Fonix iSpeak has a playlist that includes all such files.
#
# Usage: qclip dir
# 1) qclip sorts the text files in dir by name and renames them like
# 00.txt 01.txt 02.txt
# 2) qclip retrieves the clipboard text. If the text is a URL, qclip
# fetches the URL and uses that text.
# 3) qclip attempts to strip any of the headers and footers it knows about.
# (Add extra headers and footers to the arrays immediately below.)
# 4) qclip writes the text to a new file in the dir e.g. 03.txt
#
# XXX The popups of some sites (mercurynews.com) foil the URL fetching feature?
# XXX For multipage articles, it would be cool to scan for a well-known
# "print this page" link and substitute the text of the linked-to page.
use warnings;
use strict;

# If this substring appears in the first half of the text, delete up to its end
my @headers2Strip = (
    "--------------------------",
    "Wired News Wire service news & photos Wired Magazine HotBot (the Web)",
    " El Cato",                 # cato.org
    "Donate Today!",            # socialsecurity.org
    " »",
    "| Main",                   # typepad
    "Posted on",                # mercurynews.com
    "Email page",               #
    "Syndicate: xml or rdf",    #
    "Reprints",                 # nytimes.com
    "Font Size:",               # techcentralstation.com
    "Site Feed",                # scrivener.net
    "HotBot (the Web)",         # wired.com
);

# If this substring appears in the last half of the text, delete it and the rest
my @footers2Strip = (
    "--------------------------",
    "___ ",                     # Yahoo news
    "Email Story",              # Yahoo news
    "Printer Friendly Version", #
    "Printer friendly",         #
    "Post a comment",           # movable type
    "Next Article in",          # nytimes.com
    "Ads by Google",            # latimes.com
    "Copyright 2005 The New York Times Company",
    "Reason Public Policy Institute is a public policy think tank",
    "Have a comment on this article?",
    "Sign up for Sacramento Bee newsletters",
);

# Preview the die string for 10 seconds before the Windows terminal goes away
$SIG{'__DIE__'} = sub {
    # A __DIE__ hook also fires for exceptions raised inside eval blocks and
    # while code is still being compiled.  Those are not fatal to the script,
    # so let them propagate without the message and the 10 second pause.
    return if !defined( $^S ) || $^S;
    print( "$_[0]\n" );
    sleep 10;
    die $_[0];
};

use Win32::Clipboard;

# Return the sorted names (not paths) of the entries of $dir that end in
# ".txt" and are not directories.  Dies if $dir cannot be opened.
sub _TextFileNamesIn {
    my ( $dir ) = @_;
    opendir( my $dh, $dir ) || die "cannot opendir $dir: $!";
    my @names = grep { /\.txt\z/ && !-d "$dir/$_" } readdir( $dh );
    closedir( $dh );
    @names = sort( @names );
    return @names;
}

# Rename the text files in $dir, in sorted-name order, to a gap-free
# zero-padded sequence: 00.txt 01.txt 02.txt ...
#
# The numbers are padded to at least two digits, and to more when that is
# needed for the file count itself to fit, so that the names (including the
# one NextTextFileIn will hand out next) keep sorting in sequence order.
# Dies if the directory cannot be read or a rename fails.
sub RenumberTextFilesIn( $ ) {
    my $dir = shift(@_);
    my @files = _TextFileNamesIn( $dir );

    my $width = length( scalar( @files ) );
    if ( $width < 2 ) { $width = 2; }

    # Work out which files actually have to move.
    my @moves;
    my $index = 0;
    foreach my $file ( @files ) {
        my $target = sprintf( "%0*d.txt", $width, $index++ );
        if ( $file ne $target ) { push( @moves, [ $file, $target ] ); }
    }

    # Renaming straight to the final name could replace a file that has not
    # been processed yet (e.g. "0.txt" -> "00.txt" while "00.txt" still
    # exists).  So first park every file that moves under a temporary name,
    # then give each one its final name.
    my $serial = 0;
    foreach my $move ( @moves ) {
        my $parked = "qclip-renumber-$$-" . $serial++ . ".tmp";
        rename( "$dir/$move->[0]", "$dir/$parked" )
            || die "cannot rename $dir/$move->[0] to $dir/$parked: $!";
        $move->[0] = $parked;
    }
    foreach my $move ( @moves ) {
        rename( "$dir/$move->[0]", "$dir/$move->[1]" )
            || die "cannot rename $dir/$move->[0] to $dir/$move->[1]: $!";
    }
}

# Return the path of the next text file to create in $dir: the name of the
# last existing text file (in sorted order) with its basename incremented,
# or "$dir/00.txt" when $dir holds no text files yet.  The returned path
# never names an existing file.
sub NextTextFileIn( $ ) {
    my $dir = shift(@_);
    my @textfiles = _TextFileNamesIn( $dir );
    if ( !@textfiles ) { return "$dir/00.txt"; }

    my $basename = $textfiles[ $#textfiles ];
    $basename =~ s/\.txt$//;
    # String increment keeps the zero padding ("09" -> "10").  Keep going if
    # the candidate is already taken so an existing clip is never overwritten.
    do { $basename++; } while ( -e "$dir/$basename.txt" );
    return "$dir/$basename.txt";
}

# Strip the known headers and footers from $text and tidy its whitespace.
# For each header key found starting in the first half of the text,
# everything up to the end of its last such occurrence is dropped.  For each
# footer key found in the last half, the key and everything after it is
# dropped.  Returns the trimmed text.
sub TrimText( $ ) {
    my $text = shift(@_);
    foreach my $headerKey ( @headers2Strip ) {
        my $headerPos = rindex( $text, $headerKey, int( length( $text ) / 2 ) );
        if ( $headerPos < 0 ) { next; }
        #print "beheading at " . $headerPos . ": " . $headerKey . "\n";
        $text = substr( $text, $headerPos + length( $headerKey ) );
    }
    foreach my $footerKey ( @footers2Strip ) {
        my $footerPos = index( $text, $footerKey, int( length( $text ) / 2 ) );
        if ( $footerPos < 0 ) { next; }
        #print "defooting at " . $footerPos . ": " . $footerKey . "\n";
        $text = substr( $text, 0, $footerPos );
    }
    # Clean up excess whitespace
    $text =~ s/\r//sg;              # carriage returns
    $text =~ s/^[ \n]*//sg;         # leading whitespace
    $text =~ s/\n\n\n+/\n\n/sg;     # multiple blank lines
    $text =~ s/[ \n]+$//sg;         # trailing whitespace
    return $text;
}

# cf. http://pleac.sourceforge.net/pleac_perl/webautomation.html
use LWP::Simple;
use HTML::TreeBuilder;
use HTML::FormatText;

# Fetch $url and return its HTML rendered as plain text with no left margin.
# Dies if nothing could be fetched.
sub GetPrintableText( $ ) {
    my $url = shift(@_);
    my $html = LWP::Simple::get( $url );
    if ( !defined( $html ) || length( $html ) <= 0 ) {
        die "could not get $url";
    }
    my $tree = HTML::TreeBuilder->new();
    $tree->parse( $html );
    $tree->eof();
    my $formatter = HTML::FormatText->new( leftmargin => 0 );
    my $ascii = $formatter->format( $tree );
    $tree->delete();    # release the parse tree explicitly once formatted
    return $ascii;
}

my $clipdir = $ARGV[0];
if ( !defined( $clipdir ) ) { die "Usage: qclip dir\n"; }

RenumberTextFilesIn( $clipdir );
my $clipfile = NextTextFileIn( $clipdir );

my $clipBoard = Win32::Clipboard();
my $clipText = $clipBoard->Get();
if ( !defined( $clipText ) || $clipText !~ /\S/ ) {
    die "the clipboard holds no text\n";
}

# Treat the clipboard as a URL only when it is a single short http(s) token
# (surrounding whitespace allowed); anything else is taken as article text.
if ( length( $clipText ) < 1000 && $clipText =~ m{\A\s*(https?://\S+)\s*\z} ) {
    # GetTitleBytes( $clipText ); exit 0;
    $clipText = GetPrintableText( $1 );
}
$clipText = TrimText( $clipText );
#print $clipText; exit 0;

open( my $out, '>', $clipfile ) || die "cannot open $clipfile for writing: $!";
print {$out} $clipText          or die "cannot write to $clipfile: $!";
close( $out )                   || die "cannot close $clipfile: $!";

# Preview the top and bottom of the trimmed text for 5 seconds
$| = 1; # autoflush

# Top: the first 300 characters, cut back to the last complete line if any.
my $top = substr( $clipText, 0, 300 );
my $lastNewline = rindex( $top, "\n" );
if ( $lastNewline > 0 ) { $top = substr( $top, 0, $lastNewline ); }
print $top;
print "\n...\n";

# Bottom: the last 300 characters, starting after the first newline if any.
my $bot = length( $clipText ) > 300 ? substr( $clipText, -300 ) : $clipText;
$bot = substr( $bot, index( $bot, "\n" ) + 1 );
print $bot;
sleep 5;