X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FMsgTime.pm;h=8eee9a7586922c0fb302421c4c195b4540793fd2;hb=95bdac7f09c69036efed537a4d03d5bdd2ae4eb6;hp=c67a41fff2effa6b2bfbc163040936350f5a4d8c;hpb=cfb8d16578e7f2f2e300f9f436205e4a8fc7f322;p=public-inbox.git diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm index c67a41ff..8eee9a75 100644 --- a/lib/PublicInbox/MsgTime.pm +++ b/lib/PublicInbox/MsgTime.pm @@ -1,23 +1,134 @@ -# Copyright (C) 2018 all contributors +# Copyright (C) 2018-2020 all contributors # License: AGPL-3.0+ + +# Various date/time-related functions package PublicInbox::MsgTime; use strict; use warnings; use base qw(Exporter); our @EXPORT_OK = qw(msg_timestamp msg_datestamp); -use Date::Parse qw(str2time); -use Time::Zone qw(tz_offset); +use Time::Local qw(timegm); +my @MoY = qw(january february march april may june + july august september october november december); +my %MoY; +@MoY{@MoY} = (0..11); +@MoY{map { substr($_, 0, 3) } @MoY} = (0..11); + +my %OBSOLETE_TZ = ( # RFC2822 4.3 (Obsolete Date and Time) + EST => '-0500', EDT => '-0400', + CST => '-0600', CDT => '-0500', + MST => '-0700', MDT => '-0600', + PST => '-0800', PDT => '-0700', + UT => '+0000', GMT => '+0000', Z => '+0000', + + # RFC2822 states: + # The 1 character military time zones were defined in a non-standard + # way in [RFC822] and are therefore unpredictable in their meaning. +); +my $OBSOLETE_TZ = join('|', keys %OBSOLETE_TZ); + +sub str2date_zone ($) { + my ($date) = @_; + my ($ts, $zone); + + # RFC822 is most likely for email, but we can tolerate an extra comma + # or punctuation as long as all the data is there. + # We'll use '\s' since Unicode spaces won't affect our parsing. + # SpamAssassin ignores commas and redundant spaces, too. + if ($date =~ /(?:[A-Za-z]+,?\s+)? # day-of-week + ([0-9]+),?\s+ # dd + ([A-Za-z]+)\s+ # mon + ([0-9]{2,4})\s+ # YYYY or YY (or YYY :P) + ([0-9]+)[:\.] # HH: + ((?:[0-9]{2})|(?:\s?[0-9])) # MM + (?:[:\.]((?:[0-9]{2})|(?:\s?[0-9])))? # :SS + \s+ # a TZ offset is required: + ([\+\-])? # TZ sign + [\+\-]* # I've seen extra "-" e.g. "--500" + ([0-9]+|$OBSOLETE_TZ)(?:\s|$) # TZ offset + /xo) { + my ($dd, $m, $yyyy, $hh, $mm, $ss, $sign, $tz) = + ($1, $2, $3, $4, $5, $6, $7, $8); + # don't accept non-English months + defined(my $mon = $MoY{lc($m)}) or return; + + if (defined(my $off = $OBSOLETE_TZ{$tz})) { + $sign = substr($off, 0, 1); + $tz = substr($off, 1); + } + + # Y2K problems: 3-digit years, follow RFC2822 + if (length($yyyy) <= 3) { + $yyyy += 1900; + + # and 2-digit years from '09 (2009) (0..49) + $yyyy += 100 if $yyyy < 1950; + } + + $ts = timegm($ss // 0, $mm, $hh, $dd, $mon, $yyyy); -sub zone_clamp ($) { - my ($zone) = @_; - $zone ||= '+0000'; + # 4-digit dates in non-spam from 1900s and 1910s exist in + # lore archives + return if $ts < 0; + + # Compute the time offset from [+-]HHMM + $tz //= 0; + my ($tz_hh, $tz_mm); + if (length($tz) == 1) { + $tz_hh = $tz; + $tz_mm = 0; + } elsif (length($tz) == 2) { + $tz_hh = 0; + $tz_mm = $tz; + } else { + $tz_hh = $tz; + $tz_hh =~ s/([0-9]{2})\z//; + $tz_mm = $1; + } + while ($tz_mm >= 60) { + $tz_mm -= 60; + $tz_hh += 1; + } + $sign //= '+'; + my $off = $sign . ($tz_mm * 60 + ($tz_hh * 60 * 60)); + $ts -= $off; + $sign = '+' if $off == 0; + $zone = sprintf('%s%02d%02d', $sign, $tz_hh, $tz_mm); + + # Time::Zone and Date::Parse are part of the same distibution, + # and we need Time::Zone to deal with tz names like "EDT" + } elsif (eval { require Date::Parse }) { + $ts = Date::Parse::str2time($date); + return undef unless(defined $ts); + + # off is the time zone offset in seconds from GMT + my ($ss,$mm,$hh,$day,$month,$year,$off) = + Date::Parse::strptime($date); + return undef unless(defined $off); + + # Compute the time zone from offset + my $sign = ($off < 0) ? '-' : '+'; + my $hour = abs(int($off / 3600)); + my $min = ($off / 60) % 60; + + # deal with weird offsets like '-0420' properly + $min = 60 - $min if ($min && $off < 0); + + $zone = sprintf('%s%02d%02d', $sign, $hour, $min); + } else { + warn "Date::Parse missing for non-RFC822 date: $date\n"; + return undef; + } + + # Note: we've already applied the offset to $ts at this point, + # but we want to keep "git fsck" happy. # "-1200" is the furthest westermost zone offset, # but git fast-import is liberal so we use "-1400" if ($zone >= 1400 || $zone <= -1400) { warn "bogus TZ offset: $zone, ignoring and assuming +0000\n"; $zone = '+0000'; } - $zone; + [$ts, $zone]; } sub time_response ($) { @@ -28,37 +139,30 @@ sub time_response ($) { sub msg_received_at ($) { my ($hdr) = @_; # Email::MIME::Header my @recvd = $hdr->header_raw('Received'); - my ($ts, $zone); + my ($ts); foreach my $r (@recvd) { - $zone = undef; - $r =~ /\s*(\d+\s+[[:alpha:]]+\s+\d{2,4}\s+ - \d+\D\d+(?:\D\d+)\s+([\+\-]\d+))/sx or next; - $zone = $2; - $ts = eval { str2time($1) } and last; + $r =~ /\s*([0-9]+\s+[a-zA-Z]+\s+[0-9]{2,4}\s+ + [0-9]+[^0-9][0-9]+(?:[^0-9][0-9]+) + \s+([\+\-][0-9]+))/sx or next; + $ts = eval { str2date_zone($1) } and return $ts; my $mid = $hdr->header_raw('Message-ID'); warn "no date in $mid Received: $r\n"; } - defined $ts ? [ $ts, zone_clamp($zone) ] : undef; + undef; } sub msg_date_only ($) { my ($hdr) = @_; # Email::MIME::Header my @date = $hdr->header_raw('Date'); - my ($ts, $zone); + my ($ts); foreach my $d (@date) { - $zone = undef; - # Y2K problems: 3-digit years - $d =~ s!([A-Za-z]{3}) (\d{3}) (\d\d:\d\d:\d\d)! - my $yyyy = $2 + 1900; "$1 $yyyy $3"!e; - $ts = eval { str2time($d) }; + $ts = eval { str2date_zone($d) } and return $ts; if ($@) { my $mid = $hdr->header_raw('Message-ID'); warn "bad Date: $d in $mid: $@\n"; - } elsif ($d =~ /\s+([\+\-]\d+)\s*\z/) { - $zone = $1; } } - defined $ts ? [ $ts, zone_clamp($zone) ] : undef; + undef; } # Favors Received header for sorting globally