#!/usr/bin/env perl

#
# Program:
#
#     eudora2unix.pl    (Perl script)
#
# Copyright and Author:
#
#     Copyright (C) 2002  Eric Maryniak
#
#     Eric Maryniak <e.maryniak@pobox.com>
#     WWW homepage: http://pobox.com/~e.maryniak/
#
# Version:
#
#     Last modification: 2002-01-29  (started: 2002-01-16).
#     Number of lines in file: ca. 510.
#
# License:
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# Program Info:
#
#    Convert a Eudora .mbx (mbox) file to Linux/Unix (KMail) format.
#    See the master script 'eudora2unix.sh' that calls this script for
#    all the mailboxes it loops over.
#
#    Based on the eud2unx.pl script kindly provided by Blake Hannaford.
#
#    Usage:
#
#        eudora2unix.pl onefile.mbx
#
#    This programs emits headers when an empty line (^$) is seen, in
#    accordance with rfc822 instead of when a "From: " is encountered
#    as in the original version.
#    En passant, DOS to Unix end-of-line conversion is applied, i.e.
#    a trailing carriage return (^M) on a line is removed.
#    If a "Date: " is missing, it is added to make KMail happy and
#    extracted from the Eudora specific "From ???@???" line.
#    The "From " (not "From: ") is extracted from the first encountered
#    "From: " or else "Sender: " or else "Return-Path: ".
#    If no sender is found, then "unknown@unknown.unknown" is used for
#    easy later grepping.
#
# End of Program Info.
#


use FileHandle;

# Configuration.
# Verbosity.
# Determines if subroutines {log,warn,err}_msg send output to STDOUT, too:
#
#     $verbose = -1;  # ultra quiet: not even the mailbox's message total
#     $verbose =  0;  # really quiet
#     $verbose =  1;  # errors only
#     $verbose =  2;  # warnings and errors only
#     $verbose =  3;  # logging, warnings and errors
#

$verbose = 0;

# Add a 'X-Eudora2Unix: <ISO 8601 date> converted' header at the end
# of the emitted headers (see sub emit_headers),  0=no, 1=yes.
# This can come in handy later to differentiate between 'new' KMail
# messages and those inherited from the conversion.

$emit_X_Eudora2Unix_Header = 1;

# End of configuration.

# Program name and title in various banners.
($P = $0) =~ s@.*/@@;

# File argument (must be exactly 1).
$ifilename = $ARGV[0];
$mbx       = "$ifilename";  # a shorter name for convenience.

if ($mbx eq "") {
    print STDERR "$P: usage: eudora2unix.pl one-eudora-mailbox-file.mbx\n";
    exit 1;
}

open (INPUT, $mbx) or die ("$P: cannot open \"$mbx\"");

$ofilename = "$mbx.OUT";   # Output file
$lfilename = "$mbx.LOG";   # Log file
$wfilename = "$mbx.WARN";  # Warning file
$efilename = "$mbx.ERR";   # Error file

open (OUTPUT,  ">" . $ofilename) or die ("$P: cannot open \"$ofilename\"");
open (LOGOUT,  ">" . $lfilename) or die ("$P: cannot open \"$lfilename\"");
open (WARNOUT, ">" . $wfilename) or die ("$P: cannot open \"$wfilename\"");
open (ERROUT,  ">" . $efilename) or die ("$P: cannot open \"$efilename\"");

# Initialize state engine.
# Start at the Eudora specific pattern "^From ???@???" and keep gathering
# all headers. When an empty line (^$) is found, emit the headers.
# Replace ???@??? with the e-mail address in "From: " (if found), or else
# use "Sender: " or else "Return-Path: " or else use the magic address
# "unknown@unknown.unknown" for later analysis (easily greppable).
# Also add a "Date: " if missing (mostly for outbound mail sent by yourself)
# which is extracted from the Eudora "From ???@??? ..." line, but is
# reformatted, because Eudora has the order in the datum items wrong.
# Eudora uses:
#     dayname-abbr monthname-abbr monthnumber-nn time-hh:mm:ss year-yyyy
# whereas it should be:
#     dayname-abbr monthnumber-nn monthname-abbr year-yyyy time-hh:mm:ss
# Example:
#     Thu 03 Jan 2002 11:42:42    (24 characters)
#

$state    = 0;   # in "Headers: " (1) or not (0)
@hdr_line = ();  # header lines gathered in current message
$hdr_from = "";  # encountered a "From:" and stored it
$hdr_send = "";  # encountered a "Sender:" and stored it
$hdr_retp = "";  # encountered a "Return-Path:" and stored it
$hdr_date = "";  # encountered a "Date: " field and stored it

$n_msg    = 0;   # number of messages in this mailbox
$n_line   = 0;   # line number of current line record (for messages)

$exit_code = 0;  # exit code: 0 if all ok, 1 if any warnings or errors

#
# Help functions.
#

$last_l_msg = 1;  # remember message number of last message printed
$last_w_msg = 1;
$last_e_msg = 1;
$n_log_msg  = 0;  # remember the number of messages printed
$n_warn_msg = 0;
$n_err_msg  = 0;

sub log_msg {
    local $msg = "";
    $msg .= "\n" if ($n_msg > $last_l_msg && $n_log_msg != 0);
    $last_l_msg = $n_msg;
    # Newline has been prepended if this was an new mailbox message.
    $msg .= "$mbx (msg #$n_msg, line ~#$n_line):\n  $_[0]\n";
    print STDOUT $msg if ($verbose >= 3);
    print LOGOUT $msg;
    $n_log_msg++;
}
sub warn_msg {
    local $msg = "";
    $msg .= "\n" if ($n_msg > $last_w_msg && $n_warn_msg != 0);
    $last_w_msg = $n_msg;
    # Newline has been prepended if this was an new mailbox message.
    $msg .= "$mbx (msg #$n_msg, line ~#$n_line):\n  $_[0]\n";
    print STDERR  $msg if ($verbose >= 2);
    print WARNOUT $msg;
    $n_warn_msg++;
    $exit_code = 1;
}
sub err_msg {
    local $msg = "";
    $msg .= "\n" if ($n_msg > $last_e_msg && $n_err_msg != 0);
    $last_e_msg = $n_msg;
    # Newline has been prepended if this was an new mailbox message.
    $msg .= "$mbx (msg #$n_msg, line ~#$n_line):\n  $_[0]\n";
    print STDERR $msg if ($verbose >= 1);
    print ERROUT $msg;
    $n_err_msg++;
    $exit_code = 1;
}

sub iso_8601_zulu {
    # Return a string with the date and time in ISO 8601 form for
    # 'timezone' Zulu, ie. UTC +00:00, or zero-meridian and formerly
    # known as 'GMT', or 'Greenwich Mean Time'.
    # See http://www.cl.cam.ac.uk/~mgk25/iso-time.html for more info on
    # on the ISO International Standard Date and Time Notation (ISO 8601).
    # Example: for February 28, 2002 14:42:42 UTC+01:00, this string:
    #     2002-02-28T13:42:42Z
    # will be returned and showes the date and time in 'timezone' Zulu.
    # Note the one (1) hour difference in this particular example (because
    # it is in UTC +01:00), the 'T' separator between date and time and the
    # 'Z' (Zulu) designator for the UTC +00:00 'timezone'.
    # CVS, a popular source code and document versioning program, also uses
    # Zulu time.

    ($gm_second,   $gm_minute,  $gm_hour,
     $gm_monthday, $gm_month,   $gm_year,
     $gm_weekday,  $gm_yearday, $gm_isdst) = gmtime(time);

    local $iso_8601_zulu_datetime =
        sprintf ("%04d-%02d-%02dT%02d:%02d:%02dZ",
                 $gm_year+1900, $gm_month+1, $gm_monthday,
                 $gm_hour,      $gm_minute,   $gm_second);

    return $iso_8601_zulu_datetime;
}

sub emit_headers {
    # Uses global variables (yes, i know) @hdr_line, $hdr_date,
    # $hdr_{from,retp,send}, $n_line and $n_msg in read-only mode.
    # The first, $hdr_line[0], is assumed to be the Eudora specific
    # "^From ???@???" line, and a date is extracted from it, unless
    # $hdr_date is not empty (i.e. a "Date: " is present, which is
    # the common case). If the former case, the extracted date is
    # reformatted (see program introduction) and a properly rewritten
    # "Date: " is appended directly under the "From ".
    # The "???@???" is replaced with (if set) $hdr_from.
    # Note that $hdr_from (and family) and $hdr_date have already
    # the "From:" and "Date:" part removed.
    # The flag variable $emit_X_Eudora2Unix_Header is used to determine
    # if a 'X-Eudora2Unix: ' should be added. It looks like:
    #     X-Eudora2Unix: 2002-02-28T13:42:42Z converted
    # where the date and time (for timezone UTC +00:00, formerly known
    # as 'GMT') is in ISO 8601 format (in the so called extended format,
    # to be precize). It shows the time and date of conversion of this
    # specific message in Zulu (zero-meridian) ISO 8601 date and time.
    # See http://www.cl.cam.ac.uk/~mgk25/iso-time.html for more info on
    # on the ISO International Standard Date and Time Notation.

    $hdr_line0 = $hdr_line[0];  # still has \n
    $new_date  = "";

    if ($hdr_date eq "") {
        # Extract a date from Eudora's "From ???@??? " (with checks).
        $extr_hdr_date = substr($hdr_line0, 13);  # still has \n
        chomp ($extr_hdr_date);  # \n removed
        @d = split ' ', "$extr_hdr_date";  # 5 date items (0..4), 24 chars
        $len_xhdate = length ($extr_hdr_date);
        if ($len_xhdate != 24) {
            $msg = "bad date (len=$len_xhdate != 24): $extr_hdr_date";
            &log_msg ("$msg");
            &err_msg ("$msg");
        }
        $n_date_elem = $#d + 1;
        if ($n_date_elem != 5) {
            $msg = "bad date (n=$n_date_elem != 5): $extr_hdr_date";
            &log_msg ("$msg");
            &err_msg ("$msg");
        }
        $new_date = "Date: $d[0] $d[2] $d[1] $d[4] $d[3]";  # \n added later
        $msg = "NO  date field, added    [$new_date]";
        &warn_msg ("$msg");
    }

    # Determine sender's address from "From:" and fall back on
    # "Sender:" and "Return-Path:", respectively.
    $new_from = $hdr_from;  # still has \n
    chomp ($new_from);      # \n removed

    $new_from = $hdr_send if ($new_from eq "");
    $new_from = $hdr_retp if ($new_from eq "");
    if ($new_from eq "") {
        $new_from = "unknown\@unknown.unknown";
        $msg = "NO  sender field, used   [$new_from]";
        &log_msg ("$msg");
        &err_msg ("$msg");
    } else {
        $msg = "had sender field, used   [$new_from]";
        &log_msg ("$msg");
    }

    # Extract an e-mail address from $new_from with a _greedy_ match,
    # if it matches on <...>, i.e. use the question mark (?) in (.*?).
    # This ensures that after a "<", the first ">" will be matched.
    # Especially Return-Path's can have multiple e-mail addresses.
    # If there are parentheses, the e-mail address is usually outside it,
    # so simply remove it.
    if ($new_from =~ m/<(.*?)>/) {
        $email_address = $1;
    } elsif ($new_from =~ m/\(.*?\)/) {
        $email_address = $new_from;
        $email_address =~ s/\(.*\)//;
    } else {
        $email_address = $new_from;
    }
    $msg = "e-mail address extracted <$email_address>";
    &log_msg ("$msg");

    $hdr_line0  =~ s/\?\?\?\@\?\?\?/$email_address/;  # still has \n
    $hdr_line0 .=  $new_date."\n" if ($hdr_date eq "");

    # Add a 'X-Eudora2Unix: ' header (if $emit_X_Eudora2Unix_Header is true).
    # This header looked like (example: February 28, 2002 14:42:42 UTC+01:00):
    #     X-Eudora2Unix: 2002-02-28T14:42:42+01:00 converted
    # and showed the date and time of conversion of this specific message.
    # It required GNU's date that supports the %Y and %z format specifier.
    # Alternatively, the Date::Manip perl package could be used, but this
    # is not always installed.
    # However, now the header has been changed to Zulu time (UTC +00:00),
    # using Perl's built-in 'gmtime(time)'.
    # This header looks like (example: February 28, 2002 14:42:42 UTC+01:00):
    #     X-Eudora2Unix: 2002-02-28T13:42:42Z converted
    # and showes the date and time of conversion of this specific message in
    # Zulu time (just like CVS). Note the one (1) hour difference in this
    # particular example.
    $hdr_lineX = "";
    if ($emit_X_Eudora2Unix_Header == 1) {
        # Begin of new time code with Perl's built-in gmtime(time).
        #
        # Get Zulu time with gmtime(time) and 'correct' month day (0..11)
        # and year (has 1900 substracted from it).
        # See also sub iso_8601_zulu(), provided for pleasure but not used
        # here, as again, a function call proved to perform worse by a factor
        # of 2 to 3 as compared to using the code directly.

        ($gm_second,   $gm_minute,  $gm_hour,
         $gm_monthday, $gm_month,   $gm_year,
         $gm_weekday,  $gm_yearday, $gm_isdst) = gmtime(time);

        $utc_zulu_iso_date =
            sprintf ("%04d-%02d-%02dT%02d:%02d:%02dZ",
                     $gm_year+1900, $gm_month+1, $gm_monthday,
                     $gm_hour,      $gm_minute,   $gm_second);

        $hdr_lineX .= "X-Eudora2Unix: $utc_zulu_iso_date converted\n";

        #
        # End of new time code with Perl's built-in gmtime(time).

        # Previously this old time code was used, using Gnu's date (with %z).
        # It is correct but almost 10 times slower, when compared to gmtime()!
        # Begin of old time code with Gnu's date.
        #
        #chomp ($iso_date = `date '+%Y-%m-%dT%H:%M:%S%z' 2>/dev/null`);
        ## The GNU date's %z specifier is not ISO 8601 compliant: the output
        ## looks like +0100, whereas it should be +01:00.
        #if (length ($iso_date) == 24) {
        #    # Using substr as an lvalue.
        #    substr ($iso_date, -2, 0) = ":";
        #}
        #if (length ($iso_date) == 25) {
        #    $hdr_lineX .= "X-Eudora2Unix: $iso_date converted\n";
        #} else {
        #    # The $iso_date variable is empty or has a bad value.
        #    # Probably a defunct or non-Gnu compatible version of the
        #    # Unix 'date' command. Do not try anymore in the future.
        #    $emit_X_Eudora2Unix_Header = 0;
        #}
        #
        # End of old time code with Gnu's date.
    }

    # Now emit all the headers.
    print OUTPUT "\n" unless ($n_msg == 1);  # blank line begins Unix mail
    print OUTPUT $hdr_line0;
    for ($i = 1; $i <= $#hdr_line; $i++) {
        print OUTPUT $hdr_line[$i];
    }
    print OUTPUT $hdr_lineX if ($hdr_lineX ne "");

    # It is _not_ necessary to emit an extra newline (after which the
    # Body of the message comes), because the caller takes care of that.
}


#
# Main loop, that reads the mailbox file and converts.
#

print STDOUT  "$mbx:\n"  unless ($verbose < 0);


while (<INPUT>) {

    $n_line++;
    s/\r\n$/\n/;  # chop any carriage return (i.e. \r or ^M: a DOS end-of-line)

    # Important first test:
    if ($state == 1 && m/^From \?\?\?\@\?\?\?/) {
        #
        # We have a "From ???@???" line while already _in_ the headers.
        # This should not happen. The previous message is probably empty
        # and does not have the required empty line (^$) to terminate
        # the message headers and start the message body.
        # Emit this as a message and set state to 0, i.e. ready for the
        # next if statement to pick the current message up.
        #
        &err_msg ("From \?\?\?\@\?\?\? encountered while already in message");
        &emit_headers;
        $n_msg++;
        @hdr_line = ();
        $hdr_from = "";
        $hdr_retp = "";
        $hdr_send = "";
        $hdr_date = "";
        $state    = 0;
    }

    # Regular message start detection.
    if ($state == 0 && m/^From \?\?\?\@\?\?\?/) {
        #
        # Bingo, we're in a message. Set state variable appropiately.
        # The @hdr_line, $hdr_{from,date} variables are already cleared.
        #
        $n_msg++;
        $state = 1;
    } elsif ($state == 0) {
        # Message body, simply output the line.
        print OUTPUT ( $_ );
    }

    # Check if end of Header:'s.
    if ($state == 1 && m/^$/) {
        # End of message headers. Emit them and go to the next line.
        &emit_headers;
        print OUTPUT "\n";  # separates headers and body of message text
        @hdr_line = ();
        $hdr_from = "";
        $hdr_retp = "";
        $hdr_send = "";
        $hdr_date = "";
        $state    = 0;
    }

    # Message header processing.
    if ($state == 1 && ! m/^$/) {

        push (@hdr_line, $_);

        # Handle "Date: " and "From: " fields specially.
        # Keep the first encountered non-empty value in $hdr_date and
        # $hdr_from, thus ignoring subsequent Date:'s and From:'s, that
        # are, however, still added to @hdr_line in order to keep the
        # original (albeit malformed) message as intact as possible.
        # Use "Sender: " and "Return-Path: " as a backup for "From: ".
        # Try to extract a "Date:" unless we already have one.
        # Only keep them in @hdr_line, if they are not empty.
        # In other words, pop them away again, if there are empty.

        if (m/^Date: / && $hdr_date eq "") {
            $hdr_date = substr($_, 6);
            pop (@hdr_line) if ($hdr_date eq "");
        } elsif (m/^Date:/ && $hdr_date eq "") {
            $hdr_date = substr($_, 5);
            pop (@hdr_line) if ($hdr_date eq "");
        }

        # Try to extract a "From: " unless we already have one.
        # Only keep them in @hdr_line, if they are not empty.
        # In other words, pop them away again, if there are empty.

        if (m/^From: / && $hdr_from eq "") {
            $hdr_from = substr($_, 6);
            pop (@hdr_line) if ($hdr_from eq "");
        } elsif (m/^From:/ && $hdr_from eq "") {
            $hdr_from = substr($_, 5);
            pop (@hdr_line) if ($hdr_from eq "");
        }
        if (m/^Sender: / && $hdr_send eq "") {
            $hdr_send = substr($_, 8);
            pop (@hdr_line) if ($hdr_send eq "");
        } elsif (m/^Sender:/ && $hdr_send eq "") {
            $hdr_send = substr($_, 7);
            pop (@hdr_line) if ($hdr_send eq "");
        }
        if (m/^Return-Path: / && $hdr_retp eq "") {
            $hdr_retp = substr($_, 13);
            pop (@hdr_line) if ($hdr_retp eq "");
        } elsif (m/^Return-Path:/ && $hdr_retp eq "") {
            $hdr_retp = substr($_, 12);
            pop (@hdr_line) if ($hdr_retp eq "");
        }
    }
}

# Check if the file isn't empty and any messages have been processed.
if ($n_line == 0) {
    &warn_msg ("empty file, so no messages present");
}
if ($n_msg == 0 && $n_line != 0) {
    &err_msg ("non-empty file, but no messages (not a Eudora mailbox file?)");
}

# For debugging and comparison with a:
#
#    'grep "^From ???@???" file.mbx | wc -l | awk '{ print $1 }'
#
#log_msg ("total number of message(s): $n_msg\n");

$msg_str       = "";
$msg_str      .= "no messages"             if ($n_msg == 0);
$msg_str      .= "total: 1 message"        if ($n_msg == 1);
$msg_str      .= "total: $n_msg messages"  if ($n_msg  > 1);

$warn_err_str  = "";
$warn_err_str .= "no warnings"           if ($n_warn_msg == 0);
$warn_err_str .= "1 warning"             if ($n_warn_msg == 1);
$warn_err_str .= "$n_warn_msg warnings"  if ($n_warn_msg  > 1);
$warn_err_str .= ", ";
$warn_err_str .= "no errors"             if ($n_err_msg  == 0);
$warn_err_str .= "1 error"               if ($n_err_msg  == 1);
$warn_err_str .= "$n_err_msg errors"     if ($n_err_msg   > 1);

print STDOUT  "    $msg_str ($warn_err_str)\n"  unless ($verbose < 0);

# Finish up. Close failures usually indicate filesystem full.
close (INPUT)   or die ("$P: cannot close \"$ifilename\"");
close (OUTPUT)  or die ("$P: cannot close \"$ofilename\"");
close (LOGOUT)  or die ("$P: cannot close \"$lfilename\"");
close (WARNOUT) or die ("$P: cannot close \"$wfilename\"");
close (ERROUT)  or die ("$P: cannot close \"$efilename\"");

exit $exit_code;

# End of Perl script: eudora2unix.pl

