#!/usr/bin/perl -w
#
# nnw2vienna
# Migrate NetNewsWire data to Vienna. I use my RSS reader data
# as a repository for unread and important (flagged) articles,
# sort of like bookmarks in a web browser. I realize that this
# is somewhat against the design model of RSS aggregators, but
# so be it. Under these circumstances, it's difficult to switch
# to a new RSS reader, because the new reader wouldn't know about
# all of those archived articles (OPML exports only contain
# subscription lists).
#
# This is a migration script that I wrote to solve that problem,
# for myself. If it's also useful for you, then all the better.
#
# Without options, this script will copy all UNREAD and FLAGGED
# messages in your NetNewsWire/FeedData files to your Vienna
# database.
#
# Version
#   0.1  08Nov2005 - release
#
# Usage
#   nnw2vienna [options]
#     --get-all         get all messages, even if already read
#     --skip-unread     don't get unread messages
#     --skip-marked     don't get marked messages
#     --direct-to-db    insert directly into db instead of writing
#                       to STDOUT (seems to be almost 30% slower
#                       than importing the SQL manually)
#
#     (these two options must be used together or not at all)
#     --file=filename   read articles from filename only
#     --dest-folder=id  copy articles to folder_id id
#
# How to use this script
#   1. From NetNewsWire, export your feeds to an OPML file.
#   2. In Vienna, import the OPML file.
#   3. QUIT BOTH NETNEWSWIRE AND VIENNA
#   4. Run this script in one of two ways:
#      The first option is simpler but takes ~30% more time.
#      Recommended if you don't have many articles to copy.
#      (OPTION 1) run script and update DB (direct-to-db)
#        nnw2vienna [options] --direct-to-db
#      (OPTION 2) run script and update DB (manual import)
#        nnw2vienna [options] > ~/Desktop/nnw2vienna.sql
#        cd ~/Library/Application\ Support/Vienna
#        sqlite messages.db < ~/Desktop/nnw2vienna.sql
#   5. Start Vienna. Hope it worked.
#
#   ** If you already have important data in Vienna, you might
#      want to make a copy of your messages.db file first.
#
# Caveats
#   Quick hack one-time-use coding style, very little error
#     checking, almost certainly has unfixed bugs.
#   Seems to work on my NNW data (126 feeds, 10k messages),
#     but not tested on anyone else's.
#   Works with OSX 10.4.3, NNW 2.0b45, Vienna 2.0.0.2012.
#     Might not work with other versions.
#   Fixes and corrections welcome, but this code might not
#     be actively maintained -- no promises at this point.
#   For the above NNW data, this script took 4 minutes to
#     export the data to a file, but the manual import step
#     took 60 minutes. Using the --direct-to-db option is
#     ~30% slower. See Dependencies, below. I tried to
#     improve performance by doing the insert on a hash of
#     arrays, but the DBI module seems to convert all
#     thusly-inserted values to strings (also, the time
#     reduction was minimal). Alternate solutions welcomed.
#   While the DB inserts are happening, your machine will
#     be IO bound -- trying to get other things done that
#     require disk access will be less than rewarding.
#   If you really, really, really need help with this, you
#     can email me: reynhout@quesera.com. But please
#     exhaust general package support avenues first.
#
# License
#   Do as thou wilt shall be the whole of the law. If this
#   is useful to you in any way, please use it. If you
#   want to rewrite it with fewer dependencies, a less
#   involved installation procedure, more optimization,
#   or fewer bugs, please do. Let me know about your
#   improvements, and I'll post them or link to them.
#   http://www.quesera.com/reynhout/misc/nnw2vienna
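#
# Speed note (an untested suggestion, not from the measurements
#   below): sqlite commits -- and syncs to disk -- after every
#   INSERT unless told otherwise, which is probably why the manual
#   import (OPTION 2) takes so long. Wrapping the dump in a single
#   transaction should help, something like:
#     { echo 'BEGIN TRANSACTION;'; \
#       cat ~/Desktop/nnw2vienna.sql; \
#       echo 'COMMIT;'; } | sqlite messages.db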
#
# Dependencies
#   expat library (from darwinports or expat.sourceforge.net)
#   Perl modules: DBI, DBD::SQLite, and XML::Parser (from CPAN)
#
# Installation Instrux
#   Starting from a default OSX 10.4.3 installation:
#
#   - install XCode Tools from OSX DVD
#
#   - install darwinports
#       curl -O http://darwinports.opendarwin.org/downloads/DarwinPorts-1.1.dmg
#       open DarwinPorts-1.1.dmg
#       (double-click to install)
#       export PATH="$PATH:/opt/local/bin"
#       sudo port -d selfupdate
#
#   - install expat
#       sudo port install expat
#
#   - install XML-Parser
#       sudo perl -MCPAN -e "install XML::Parser"
#       ## (fails, can't find expat libs, so fix LIBPATH or do by hand...)
#       cd ~/.cpan/build/XML-Parser-2.34
#       perl Makefile.PL EXPATLIBPATH=/opt/local/lib \
#                        EXPATINCPATH=/opt/local/include
#       sudo make install
#
#   - install DBI and SQLite
#       sudo perl -MCPAN -e "install DBI"
#       sudo perl -MCPAN -e "install DBD::SQLite"
#
#   - script is now ready to run.
#

use DBI;
use XML::Parser;
use Time::Local;

my $GET_ALL="n";
my $GET_MARKED="y";
my $GET_UNREAD="y";
my $DIRECT_TO_DB="n";
my ($INFILE,$DESTFID);

my $nnwfeeddir="$ENV{'HOME'}/Library/Application Support/NetNewsWire/FeedData";
my $vdbfile="$ENV{'HOME'}/Library/Application Support/Vienna/messages.db";

my ($inkeytag,$currkey,%r);
my ($parser,$dbh,$res,$currvfeedid,%vfeedid,$xmlfile,%vrfeed,%vwfeed);
my $i=0;
my $arraylev=0;
my $dictlev=0;

my @vfields=qw(message_id folder_id parent_id read_flag marked_flag deleted_flag title sender link date text);

{
  my ($arg,$dum);

  while ( $arg=shift ) {
    if ( $arg eq "--skip-marked" ) {
      $GET_MARKED="n";
    } elsif ( $arg eq "--skip-unread" ) {
      $GET_UNREAD="n";
    } elsif ( $arg eq "--get-all" ) {
      $GET_ALL="y";
    } elsif ( $arg eq "--direct-to-db" ) {
      $DIRECT_TO_DB="y";
    } elsif ( $arg =~ /--file=..*/ ) {
      ($dum,$INFILE)=split(/=/,$arg);
    } elsif ( $arg =~ /--dest-folder=..*/ ) {
      ($dum,$DESTFID)=split(/=/,$arg);
    } else {
      print STDERR "$0: ERROR: unrecognized argument: $arg\n";
      exit 1;
    }
  }

  if ( defined $INFILE xor defined $DESTFID ) {
    print STDERR "$0: ERROR: must use --file and --dest-folder together\n";
    exit 1;
  }

  $parser=new XML::Parser(Style=>'Subs',Pkg=>'SubHandlers',ErrorContext=>2);
  $parser->setHandlers(Char => \&char_handler);

  $dbh=DBI->connect("dbi:SQLite:$vdbfile")
    || die "Cannot connect: $DBI::errstr";

  if ( defined $INFILE and defined $DESTFID ) {
    $currvfeedid=$DESTFID;
    $vrfeed{$INFILE}=$DESTFID;
    if ( ! -r $INFILE ) {
      die "can't read $INFILE";
    }
    $parser->parsefile("$INFILE");
  } else {
    $res=$dbh->selectall_arrayref("SELECT folder_id,feed_url FROM rss_folders");
    foreach( @$res ) {
      $_->[1] =~ s/^http:\/\///;
      $_->[1] =~ s/\//_/g;
      $vfeedid{$_->[1]}=$_->[0];
    }
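
    ## Illustrative example of the feed_url -> filename mapping
    ## above (hypothetical URL):
    ##   http://example.com/feed/rss.xml  ->  example.com_feed_rss.xml
    ## i.e. strip the leading http:// and turn slashes into
    ## underscores, which is how NNW appears to name its FeedData
    ## files.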
    for $xmlfile (keys %vfeedid) {
      undef %vrfeed;
      undef %vwfeed;
      $currvfeedid=$vfeedid{$xmlfile};
      print STDERR "--- $xmlfile [id=$currvfeedid]\n";

      ## build hash of vienna articles for this feed -- vrfeed{message_id}=date
      $res=$dbh->selectall_arrayref("SELECT message_id,date,folder_id FROM messages WHERE folder_id=$currvfeedid");
      foreach( @$res ) {
        $vrfeed{$_->[0]}=$_->[1];
      }

      if ( ! -r "$nnwfeeddir/$xmlfile" ) {
        print "-- can't read $xmlfile (folder_id $vfeedid{$xmlfile})\n";
        next;
      } else {
        print "-- from $xmlfile (folder_id $vfeedid{$xmlfile})\n";
      }
      $parser->parsefile("$nnwfeeddir/$xmlfile");
    }
  }

  $dbh->disconnect;
}

sub char_handler {
  my ($p, $data) = @_;

  if ( $arraylev == 0 or $dictlev > 2 ) {
    return;
  }
  if ( defined $inkeytag ) {
    $currkey=$data;
  } elsif ( defined $currkey ) {
    if ( defined $r{$currkey} ) {
      $r{$currkey}.=$data;
    } else {
      $r{$currkey}=$data;
    }
  }
}

sub write_vwfeed_ng {
  my ($k);

  ## this results in all values being quoted..NG for numbers.
  my $sqlstr="INSERT INTO messages (".join(', ',@vfields).") VALUES (";
  for (my $x=0;$x<$#vfields;$x++) {
    $sqlstr .= "?, ";
  }
  $sqlstr .= "? )";

  my $sth = $dbh->prepare($sqlstr) or die $dbh->errstr;
  for $k (keys %vwfeed) {
    $sth->execute(@{$vwfeed{$k}}{@vfields}) or die $dbh->errstr;
  }
}

sub write_vwfeed {
  my ($k,$sqlstr);

  binmode STDOUT, ":utf8";
  binmode STDERR, ":utf8";

  for $k (keys %vwfeed) {
    print STDERR " + $vwfeed{$k}{'title'} ";
    if ($vwfeed{$k}{'read_flag'} != 1) { print STDERR "[UNREAD]"; }
    if ($vwfeed{$k}{'marked_flag'} == 1) { print STDERR "[MARKED]"; }
    print STDERR "\n";

    #$sqlstr = "INSERT INTO messages VALUES (";
    $sqlstr = "INSERT INTO messages (".join(', ',@vfields).") VALUES (";
    $sqlstr .= "\'$vwfeed{$k}{'message_id'}\',";
    $sqlstr .= "$vwfeed{$k}{'folder_id'},";
    $sqlstr .= "$vwfeed{$k}{'parent_id'},";
    $sqlstr .= "$vwfeed{$k}{'read_flag'},";
    $sqlstr .= "$vwfeed{$k}{'marked_flag'},";
    $sqlstr .= "$vwfeed{$k}{'deleted_flag'},";
    $sqlstr .= "\'$vwfeed{$k}{'title'}\',";
    $sqlstr .= "\'$vwfeed{$k}{'sender'}\',";
    $sqlstr .= "\'$vwfeed{$k}{'link'}\',";
    $sqlstr .= "$vwfeed{$k}{'date'},";
    $sqlstr .= "\'$vwfeed{$k}{'text'}\'";
    $sqlstr .= ");";

    if ( $DIRECT_TO_DB eq "y" ) {
      my $sth=$dbh->prepare($sqlstr);
      $sth->execute();
    } else {
      print $sqlstr . "\n";
    }
  }
}
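## Untested alternate direct-to-db writer (a sketch, not called
## anywhere): bound placeholders instead of hand-built SQL, and a
## single transaction per feed batch instead of one commit+sync per
## row, which should address the slow --direct-to-db path. DBI
## binds everything as strings (the problem noted in
## write_vwfeed_ng); sqlite is loosely typed, so that may be
## harmless, but it is unverified against Vienna's schema.
## Note: add_to_vwfeed runs filter_text, which doubles single
## quotes for the hand-built SQL; with bound parameters the
## doubling would be stored literally, so it would need to be
## skipped on this path.
sub write_vwfeed_tx {
  my $sqlstr="INSERT INTO messages (".join(', ',@vfields).
             ") VALUES (".join(', ',('?') x @vfields).")";
  my $sth=$dbh->prepare($sqlstr) or die $dbh->errstr;

  $dbh->begin_work;               # one transaction for the whole batch
  for my $k (keys %vwfeed) {
    $sth->execute(@{$vwfeed{$k}}{@vfields}) or die $dbh->errstr;
  }
  $dbh->commit;                   # single commit/sync, not one per row
}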
sub filter_text {
  my $text=shift;

  $text =~ s/\'/\'\'/g;   # escape single quotes for SQL
  $text =~ s/<//gi;       # strip '<' characters
  ## TODO more filtering?

  return $text;
}

sub zdate_to_secs {
  ## 2005-09-10T19:21:00Z -> 00 21 19 10 8 105
  my $rawdate;
  my ($d,$t)=split(/[TZ]/,shift);
  my @d=split(/-/,$d);
  $d[1]--;
  my @t=split(/:/,$t);
  #print STDERR "-- POST $t[2],$t[1],$t[0],$d[2],$d[1],$d[0]\n";
  $rawdate=timelocal($t[2],$t[1],$t[0],$d[2],$d[1],$d[0]);
  return $rawdate;
}

sub init_r {
  $i++;
  %r = ( 'guid'=>'',
         'read'=>'',
         'flagged'=>'',
         'title'=>'',
         'creator'=>'',
         'link'=>'',
         'datePublished'=>'',
         'dateArrived'=>'',
         'description'=>'');
}

sub add_to_vwfeed {
  my $tdate="2000-01-01T07:37:00Z";
  my $getme=0;

  for my $key (keys %r) {
    $r{$key} =~ s/^\s+//;   # trim leading and trailing whitespace
    $r{$key} =~ s/\s+$//;
  }

  if ( $r{'datePublished'} ne '' ) {
    $tdate=$r{'datePublished'};
  } elsif ( $r{'dateArrived'} ne '' ) {
    $tdate=$r{'dateArrived'};
  }
  $tdate=&zdate_to_secs($tdate);

  if ( $r{'guid'} eq '' ) { $r{'guid'}="nnwxferid$i"; }
  if ( defined $vrfeed{$r{'guid'}} ) { return; }

  if ( $r{'flagged'} eq '' ) { $r{'flagged'}=0; }
  if ( $r{'read'} eq '' ) { $r{'read'}=0; }

  if ( $GET_ALL eq "y" ) {
    $getme++;
  } else {
    if ( $GET_MARKED eq "y" and $r{'flagged'} == 1 ) { $getme++; }
    if ( $GET_UNREAD eq "y" and $r{'read'} == 0 ) { $getme++; }
  }
  return if ( $getme == 0 );

  $vwfeed{$r{'guid'}}{'message_id'}=$r{'guid'};
  $vwfeed{$r{'guid'}}{'folder_id'}=$currvfeedid;
  $vwfeed{$r{'guid'}}{'parent_id'}=0;
  $vwfeed{$r{'guid'}}{'read_flag'}=$r{'read'};
  $vwfeed{$r{'guid'}}{'marked_flag'}=$r{'flagged'};
  $vwfeed{$r{'guid'}}{'deleted_flag'}=0;
  $vwfeed{$r{'guid'}}{'title'}=&filter_text($r{'title'});
  $vwfeed{$r{'guid'}}{'sender'}=&filter_text($r{'creator'});
  $vwfeed{$r{'guid'}}{'link'}=$r{'link'};
  $vwfeed{$r{'guid'}}{'date'}=$tdate;
  $vwfeed{$r{'guid'}}{'text'}=&filter_text($r{'description'});
}

package SubHandlers;

sub array  { $arraylev++; }
sub array_ { $arraylev--; if ( $arraylev == 0 ) { &::write_vwfeed; } }
sub plist  { }
sub plist_ { }
sub key    { if ( $arraylev == 0 ) { return; } $inkeytag="1"; }
sub key_   { if ( $arraylev == 0 ) { return; } undef $inkeytag; }
sub false  { $r{$currkey}.="0"; }
sub true   { $r{$currkey}.="1"; }
sub dict   { $dictlev++; if ( $arraylev == 1 ) { &::init_r; } }
sub dict_  { $dictlev--; if ( $arraylev == 1 ) { &::add_to_vwfeed; } }

## 13.5MB SQL data, 10k articles
#   4min to export
#  60min to import same via < redirect
#  66min to import same via sqlite shell .read
#  73min to import same via --direct-to-db
#  (messages.db goes from 3.7MB to 18.5MB)
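
## Aside on dates: the trailing "Z" in NNW's plist timestamps
## denotes UTC, but zdate_to_secs above converts with timelocal(),
## i.e. interprets them as local time, so the stored epoch seconds
## are shifted by the local UTC offset. Whether Vienna expects UTC
## or local-time epochs is unverified; if UTC turns out to be
## correct, a drop-in variant (hypothetical, not wired in):
package main;

sub zdate_to_secs_utc {
  my ($d,$t)=split(/[TZ]/,shift);
  my @d=split(/-/,$d);
  $d[1]--;                        # timegm months are 0-based
  my @t=split(/:/,$t);
  return timegm($t[2],$t[1],$t[0],$d[2],$d[1],$d[0]);  # UTC, not local
}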