#! /usr/local/bin/python2.3
### Sort and group the messages in the Data hierarchy.
### Run this prior to mksets.py for setting stuff up for
### testing of chronological incremental training.
"""Usage: sort+group.py
This program has no options! Muahahahaha!
"""
import sys
import os
import glob
import time
from email.Utils import parsedate_tz, mktime_tz
# When true, main() reports its progress on stdout (the file currently being
# scanned and the start of each phase).
loud = True
# Length of one day in seconds; main() divides elapsed seconds by this to
# assign each message to a day-sized group.
SECONDS_PER_DAY = 24 * 60 * 60
# Scan the file with path fpath for its first Received header, and return
# a UTC timestamp for the date-time it specifies. If anything goes wrong
# (can't find a Received header; can't parse the date), return None.
# This is the best guess about when we received the msg.
def get_time(fpath):
fh = file(fpath, 'rb')
lines = iter(fh)
# Find first Received header.
for line in lines:
if line.lower().startswith("received:"):
break
else:
print "\nNo Received header found."
fh.close()
return None
# Paste on continuation lines, if any.
received = line
for line in lines:
if line[0] in ' \t':
received += line
else:
break
fh.close()
# RFC 2822 says the date-time field must follow a semicolon at the end.
i = received.rfind(';')
if i < 0:
print "\n" + received
print "No semicolon found in Received header."
return None
# We only want the part after the semicolon.
datestring = received[i+1:]
# It may still be split across lines (like "Wed, \r\n\t22 Oct ...").
datestring = ' '.join(datestring.split())
as_tuple = parsedate_tz(datestring)
if as_tuple is None:
print "\n" + received
print "Couldn't parse the date: %r" % datestring
return None
return mktime_tz(as_tuple)
def main():
"""Main program; parse options and go."""
from os.path import join, split
data = [] # list of (time_received, dirname, basename) triples
if loud:
print "Scanning everything"
now = time.time()
for name in glob.glob('Data/*/*/*'):
if loud:
sys.stdout.write("%-78s\r" % name)
sys.stdout.flush()
when_received = get_time(name) or now
data.append((when_received,) + split(name))
if loud:
print ""
print "Sorting ..."
data.sort()
# First rename all the files to a form we can't produce in the end.
# This is to protect against name clashes in case the files are
# already named according to the scheme we use.
if loud:
print "Renaming first pass ..."
for dummy, dirname, basename in data:
os.rename(join(dirname, basename),
join(dirname, "-" + basename))
if loud:
print "Renaming second pass ..."
earliest = data[0][0] # timestamp of earliest msg received
i = 0
for when_received, dirname, basename in data:
extension = os.path.splitext(basename)[-1]
group = int((when_received - earliest) / SECONDS_PER_DAY)
newbasename = "%04d-%06d" % (group, i)
os.rename(join(dirname, "-" + basename),
join(dirname, newbasename + extension))
i += 1
# Do the sort-and-rename pass only when this file is run as a script, so
# that importing it has no side effects.
if __name__ == "__main__":
    main()
# syntax highlighted by Code2HTML, v. 0.9.1