#!/usr/bin/python
# Emacs: use -*-Python-*- mode.
#
# Z-mailer spam list maker
#
# Roy Bixler
# rcb@press-gopher.uchicago.edu
# 1 Dec. 1997
# Jun 1998 - modified to accept IP blocks to be banned and to get sources of
#            blacklist from a text file (default: 'spamlist_sources')
#

import ftplib, httplib, re, string, sys
from urlparse import urlparse

# returns the contents at the given URL (must be of type "file", "ftp" or
# "http") as a list of lines with trailing whitespace stripped
def get_url_contents(url):
    # 'lns' must be global: Python 1.x has no nested lexical scoping, so the
    # ftp_line callback below can only find it as a module-level name
    global lns
    lns = []
    url_comps = urlparse(url)
    if (url_comps[0] == "file"):
        f = open(url_comps[2])
        ln = f.readline()
        while ln:
            lns.append(string.rstrip(ln))
            ln = f.readline()
        f.close()
    elif (url_comps[0] == "ftp"):
        def ftp_line(ln):
            lns.append(ln)
        h = ftplib.FTP(url_comps[1])
        h.login()
        i = string.rfind(url_comps[2], '/')
        if (i >= 0):
            h.cwd(url_comps[2][:i])
            h.retrlines("RETR "+url_comps[2][i+1:], ftp_line)
        else:
            h.retrlines("RETR "+url_comps[2], ftp_line)
        h.close()
    elif (url_comps[0] == "http"):
        h = httplib.HTTP(url_comps[1])
        h.putrequest('GET', url_comps[2])
        h.putheader('Accept', 'text/html')
        h.putheader('Accept', 'text/plain')
        h.endheaders()
        errcode, errmsg, headers = h.getreply()
        # HTTP/1.1 replies generate an errcode of -1 under the broken
        # Python 1.4 httplib module (fixed in Python 1.5), so try to
        # recover the real status code from the reply line
        version = sys.version[0:3]
        if ((version < "1.5") and (errcode == -1)):
            try:
                real_errcode = string.atoi(string.split(errmsg)[1])
            except ValueError:
                real_errcode = -1 # yes, it really is bogus :-/
        else:
            real_errcode = errcode
        sys.stderr.write("%d" % (real_errcode)) # Should be 200
        if (real_errcode == 200):
            f = h.getfile()
            ln = f.readline()
            # once again, compensate for the broken HTTP/1.1 behavior by
            # eating the header lines which would otherwise show up in the
            # data (also fixed in Python 1.5)
            if ((version < "1.5") and (errcode == -1)):
                # skip up to the blank "\r\n" line that ends the headers
                while (ln and
                       ((len(ln) > 2) or
                        (ln[0] != "\r") or (ln[-1] != "\n"))):
                    ln = f.readline()
                if ln:
                    ln = f.readline() # consume the blank line itself
            while ln:
                lns.append(string.rstrip(ln)) # Get the raw HTML
                ln = f.readline()
            f.close()
    return lns
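
# A couple of illustrative calls (the URLs here are hypothetical):
#   get_url_contents("http://example.com/spammers.txt")
#   get_url_contents("file:/usr/local/etc/spamlist_sources")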

# if no @-sign is found, insert one at the beginning of the string
def atify(dom):
    if (string.find(dom, '@') == -1):
        return '@'+dom
    else:
        return dom
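
# For example, atify("example.com") yields "@example.com", while an address
# such as "user@example.com" is returned unchanged.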

# add the information found at 'svc_url' to a list of junk e-mailers.
# The list is kept in the dictionary 'jdict', which maps each banned
# address to the name of the service that reported it.  'svc_name' is
# merely used for the cosmetic purpose of progress reporting.  'start_after'
# specifies a string which marks the beginning of the list and 'end_before'
# similarly specifies a marker which tells when to stop reading the list.
# These are both optional parameters.
def add_to_junkers_dict(jdict, svc_name, svc_url, start_after='',
                        end_before=''):
    sys.stderr.write("%s: (status = " % (svc_name))
    tlns = get_url_contents(svc_url)
    sys.stderr.write(") - done\n")
    # i - line number counter
    i = 0
    if (start_after):
        # skip everything up to and including the start marker
        while ((i < len(tlns)) and
               (tlns[i][0:len(start_after)] != start_after)):
            i = i+1
        i = i+1
    while (i < len(tlns)):
        if ((end_before) and (tlns[i][0:len(end_before)] == end_before)):
            break
        if ((tlns[i]) and (tlns[i][0] != "#")):
            # assume the interesting information is at the beginning of the
            # line, up to the first whitespace character
            jdict[atify(string.split(tlns[i])[0])] = svc_name
        i = i+1
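
# An illustrative call (the service name, URL and markers are made up):
#   add_to_junkers_dict(junkers, "Example RBL",
#                       "http://example.com/spammers.txt",
#                       "BEGIN LIST", "END LIST")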

# and now for the main program

# start with an empty junk list
sl = {}
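# ('sl' will map each banned address, domain or IP prefix to the name of
# the source list that reported it)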

# open a file containing our list of sources for the blacklisted spammers
# database
if (len(sys.argv) > 1):
    ssf = open(sys.argv[1])
else:
    ssf = open("spamlist_sources")

ssl = ssf.readline()
while (ssl):
    ssl = string.strip(ssl)
    # skip comment lines
    if ((len(ssl) > 0) and (ssl[0] != '#')):
        # each line has tab-delimited fields describing one spam source
        ssi = string.split(ssl, '\t')
        # first two params. (description and URL) are mandatory
        if (len(ssi) > 1):
            bgn_slist_tag = ""
            end_slist_tag = ""
            if (len(ssi) > 2):
                bgn_slist_tag = ssi[2]
                if (len(ssi) > 3):
                    end_slist_tag = ssi[3]
            add_to_junkers_dict(sl, ssi[0], ssi[1], bgn_slist_tag, end_slist_tag)
    ssl = ssf.readline()
ssf.close()
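
# A hypothetical 'spamlist_sources' entry (fields are tab-separated; the
# description and URL are mandatory, the begin/end markers optional):
#   Example RBL<TAB>http://example.com/spammers.txt<TAB>BEGIN LIST<TAB>END LIST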

# time to sort and output our dictionary to standard output
ksl = sl.keys()
ksl.sort()
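# (in Python 1.x, keys() returns a plain list, which sort() orders in place)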

# look for IP addresses
ipv4_net = re.compile(r"^@[0-9]{1,3}(\.[0-9]{1,3}){0,3}\.?$")
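# e.g. a hypothetical list entry "@128.135" (two octets) becomes the /16
# netblock "[128.135.0.0]/16", and a full address "@1.2.3.4" becomes
# "[1.2.3.4]/32"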
for i in ksl:
    # if an IP address is found, convert to canonical netblock format and ban that
    # netblock from even connecting to smtpserver
    if (ipv4_net.match(i)):
        num_dots = string.count(i, ".")
        if (i[-1] == "."):
            # drop the trailing dot along with the leading '@'
            num_dots = num_dots-1
            i = i[1:-1]
        else:
            i = i[1:]
        if ((num_dots >= 0) and (num_dots < 4)):
            # pad the address out to four octets with zeroes
            for n in range(3, num_dots, -1):
                i = i+".0"
            print "[%s]/%d" % (i, (num_dots+1)*8)
        else:
            # shouldn't happen given the regexp above; report on stderr so
            # the diagnostic can't corrupt the output list
            sys.stderr.write("what's wrong with this? i = %s, num_dots = %d\n"
                             % (i, num_dots))
    else:
        # just output as-is (we take it to be a banned domain or e-mail address)
        print i

