/* ports//devel/libgutenfetch/work/libgutenfetch-1.2/src/libgutenfetch */

/***********************************************************************
	gutenfetch - query and fetch electronic texts from project gutenberg
    Copyright (C) 2001, 2002, 2003, 2004 Russell Francis 

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
	USA

	Last updated on $Date: 2004/03/21 05:14:23 $ by $Author: johntabularasa $.
***********************************************************************/
#include "stddefs.h"
#include "libgutenfetch_filter.h"
#ifdef HAVE_STDIO_H
#	include <stdio.h>
#endif
#ifdef HAVE_STDLIB_H
#	include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
#	include <string.h>
#endif
#ifdef HAVE_STRINGS_H
#	include <strings.h>
#endif
#ifdef HAVE_ASSERT_H
#	include <assert.h>
#endif


/* * * * * * * *  R E G U L A R   E X P R E S S I O N S  * * * * * */


/* This is a regular expression that matches a line in GUTINDEX.ALL
 * that refers to a particular resource available on the server.
 *
 * Capture groups on match:
 *   1  three-letter month abbreviation at the start of the line
 *   2  last two digits of the four-digit year
 *   3  the 50-character field that follows the date
 *   4  the bracketed file name (8 characters, '.', 3 alphanumerics)
 *   5  the run of digits that follows the bracketed file name
 *   6  trailing flag: "A", "C", "*" or the empty string
 *   7  the empty alternative nested inside group 6
 */
#define FILTER_GUTINDEX_OLD_ENTRY "^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[[:space:]][[:digit:]]{2}([[:digit:]]{2})[[:space:]](.{50})\\[(.{8}\\.[[:alnum:]]{3})\\][[:space:]]+([[:digit:]]+)(A|C|\\*|())"

/* This is a regex that matches a line in GUTINDEX.ALL
 * that refers to a particular resource > 10000 on the server.
 *
 * Capture groups on match:
 *   1  the first 73 characters of the line
 *   2  the five-digit number that follows them
 *   3  trailing flag: "A", "C", "*" or the empty string
 *   4  the empty alternative nested inside group 3
 */
#define FILTER_GUTINDEX_NEW_ENTRY "^(.{73})([[:digit:]]{5})(A|C|\\*|())"

/* This is a first stab at extracting an author's name from the listing.
 *
 * Matches ", by " (the space after the comma is optional) followed by
 * one or more words made of letters, periods and parentheses.
 * Group 1 is the whole run of words; the inner groups only exist to
 * express the repetition.
 */
#define FILTER_AUTHOR ",[[:space:]]?by[[:space:]]+((([[:alpha:]]|\\.|\\(|\\))+([[:space:]]|()))+)"

/* This is a first stab at extracting the title from a listing.
 *
 * FILTER_NEW_TITLE: group 1 is everything that precedes ", by ".
 * FILTER_OLD_TITLE: the line must start with a month abbreviation
 * (group 1) and a four-digit year; group 2 is the text between the
 * date and ", by ".
 */
#define FILTER_NEW_TITLE "(.+),[[:space:]]?by[[:space:]]"
#define FILTER_OLD_TITLE "^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[[:space:]][[:digit:]]{4}[[:space:]](.+),[[:space:]]?by[[:space:]]"

/*
 * These filters match the ls-lR and ls-R files respectively and allow
 * us to glean detailed information about an etext entry such as its
 * size, and the formats it is available in.
 *
 * This should match either the old directories "etext99" or 
 * new-style directories "1/0/0/0/10000" and return the directory in
 * that form in the 2nd and 3rd element of the matching array provided by
 * gutenfetch_filter_match
 */

/* Directory header line of the form "./<dir>:".
 *
 * Capture groups on match:
 *   1  the directory in either form
 *   2  new-style directory (four single digits and a five-digit
 *      number separated by '/'), when that alternative matched
 *   3  old-style directory ("etext" plus two digits), when that
 *      alternative matched
 */
#define FILTER_LS_LR_DETAIL_DIRECTORY "^\\./(([[:digit:]]/[[:digit:]]/[[:digit:]]/[[:digit:]]/[[:digit:]]{5})|(etext[[:digit:]][[:digit:]])):"

/* These should be the same. */
#define FILTER_LS_R_DETAIL_DIRECTORY FILTER_LS_LR_DETAIL_DIRECTORY

/* A file line: digits, a month abbreviation, a day, a year or HH:MM
 * time, then a file name.
 *
 * Capture groups on match:
 *   1  the run of digits directly before the month
 *      (presumably the file size column of "ls -l" -- confirm)
 *   2  three-letter month abbreviation
 *   3  four-digit year or HH:MM time
 *   4  file name: 1-8 characters, '.', 1-3 characters
 */
#define FILTER_LS_LR_DETAIL_ENTRY "([[:digit:]]+)[[:space:]](Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[[:space:]]{1,2}[[:digit:]]{1,2}[[:space:]]{1,2}([[:digit:]]{4}|[[:digit:]]{2}:[[:digit:]]{2})[[:space:]](.{1,8}\\..{1,3})"
/* Disabled alternative pattern for the same macro, anchored at "^-". */
//#define FILTER_LS_LR_DETAIL_ENTRY "^-(.)+([[:digit:]])+[[:space:]](Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)([[:space:]]){1,2}([[:digit:]]){1,2}([[:space:]]){1,2}(([[:digit:]]{4})|([[:digit:]]{2}:[[:digit:]]{2}))[[:space:]]((.{1,8})\\.(.{1,3}))"

/* Splits a file name into base name and extension.
 *
 * On match ...
 * basename will be array[1]
 * extension will be array[2]
 *
 * The extension is one to three characters following a '.'.  The
 * pattern is anchored at the start of the string only.
 */
#define FILTER_FILENAME_BASE_EXT "^(.+)\\.(.{1,3})"
/* An ls-R entry is matched with the same base/extension pattern. */
#define FILTER_LS_R_DETAIL_ENTRY FILTER_FILENAME_BASE_EXT

/* * * * * E N D  O F  R E G U L A R   E X P R E S S I O N S * * * * */


// The maximum number of matches to find in a regex.
#define PMATCH_SIZE	32

/* Compiled filters, one slot per predefined pattern.  The slots are
 * filled by gutenfetch_filter_init and looked up by index in
 * gutenfetch_ifilter_match. */
static gutenfetch_filter_t *ifilter[NUM_OF_IFILTER];

/* Source text of the predefined patterns.  Entry i is compiled into
 * ifilter[i], so the order here must agree with the ifilter index
 * constants declared elsewhere.
 * NOTE(review): NUM_OF_IFILTER must not exceed the number of entries
 * in this table -- confirm against the header.
 *
 * The table and its strings are never modified, hence const. */
static const char *const filter_regex[] = {
	FILTER_GUTINDEX_OLD_ENTRY,
	FILTER_GUTINDEX_NEW_ENTRY,
	FILTER_AUTHOR,
	FILTER_OLD_TITLE,
	FILTER_NEW_TITLE,
	FILTER_LS_LR_DETAIL_DIRECTORY,
	FILTER_LS_LR_DETAIL_ENTRY,
	FILTER_LS_R_DETAIL_DIRECTORY,
	FILTER_LS_R_DETAIL_ENTRY,
	FILTER_FILENAME_BASE_EXT
};

/**
 * gutenfetch_filter_init
 *
 * Compile each predefined pattern in filter_regex, with sub-match
 * extraction enabled, and store the resulting filter in the slot of
 * ifilter that has the same index.  The result of the individual
 * compilations is not inspected here.
 *
 * @return Always returns GUTENFETCH_OK.
 */
gutenfetch_error_t
gutenfetch_filter_init(void)
{
	unsigned int slot = 0;

	while (slot < NUM_OF_IFILTER) {
		ifilter[slot] = gutenfetch_filter_create(filter_regex[slot], TRUE);
		slot++;
	}

	return GUTENFETCH_OK;
}


/**
 * gutenfetch_filter_shutdown
 *
 * Free any resources held by the filter routines.
 *
 * Every populated slot of ifilter is destroyed and then reset to
 * NULL, so the function may be called more than once and a later
 * lookup sees an empty slot rather than a dangling pointer.  Slots
 * that are already NULL are skipped.
 *
 * @return Returns GUTENFETCH_OK on success.
 */
gutenfetch_error_t
gutenfetch_filter_shutdown(void)
{
	unsigned int i;

	for (i = 0; i < NUM_OF_IFILTER; ++i) {
		if (ifilter[i] != NULL) {
			gutenfetch_filter_destroy(ifilter[i]);
			/* destroy only receives a copy of the pointer, so
			 * the table entry has to be cleared here. */
			ifilter[i] = NULL;
		}
	}
	return GUTENFETCH_OK;
}

/**
 * gutenfetch_filter_create -
 * 	Create a filter based on regular expressions.
 *
 * The pattern is compiled as a POSIX extended regular expression.
 *
 * @param pattern A regular expression which the
 *		filter will match.
 * @param sub TRUE if we want to get the values of
 *		the expression that match the regex, FALSE 
 *		if we are only interested in getting a TRUE/FALSE
 *		response when we call filter_match.
 * @return A pointer to the filter we have created or
 *		NULL if pattern is NULL, memory could not be
 *		allocated, or the pattern failed to compile.
 */
gutenfetch_filter_t *
gutenfetch_filter_create(const char *pattern, int sub)
{
	int flags = REG_EXTENDED;
	gutenfetch_filter_t *filter;

	if (pattern == NULL)
		return NULL;

	filter = malloc(sizeof(*filter));
	if (filter == NULL)
		return NULL;

	filter->regex = malloc(sizeof(regex_t));
	if (filter->regex == NULL) {
		free(filter);
		return NULL;
	}

	filter->sub = sub;
	if (sub == FALSE)
		flags |= REG_NOSUB;

	if (regcomp(filter->regex, pattern, flags) != 0) {
		/* After a failed regcomp the contents of the regex_t are
		 * undefined, so it must not be handed to regfree; only
		 * the storage we allocated ourselves is released. */
		free(filter->regex);
		free(filter);
		return NULL;
	}

	return filter;
}


/**
 * Release the resources used by a filter.
 *
 * Passing NULL is allowed and does nothing.  A filter whose regex
 * member is NULL is released without calling regfree.
 *
 * The caller's own pointer is not modified; it must not be used
 * after this call.
 *
 * @param filter The filter we wish to free.
 */
void
gutenfetch_filter_destroy(gutenfetch_filter_t *filter)
{
	if (filter == NULL)
		return;

	if (filter->regex != NULL) {
		regfree(filter->regex);
		free(filter->regex);
		filter->regex = NULL;
	}
	free(filter);
}

/**
 * gutenfetch_ifilter_match
 *
 * Determine if a string matches a predefined regular expression.
 *
 * @param ifilter_index The unique index of the predefined regular
 *		expression.
 * @param str The string to try to match with the filter.
 * @return Returns a list_t* which is NULL if no match was
 *	made.  It is a valid pointer otherwise, which points to the
 *	matched sub strings in the string.
 */
list_t *
gutenfetch_ifilter_match(int ifilter_index, const char *str)
{
	return gutenfetch_filter_match(ifilter[ifilter_index], str);
}


/**
 * Determine if a string matches a regular expression.
 *
 * When the filter was created with sub-matching disabled (sub ==
 * FALSE, the same test gutenfetch_filter_create uses to set
 * REG_NOSUB), a match yields a list holding one copy of str.
 * Otherwise the list holds one heap-allocated string per
 * sub-expression: element 0 is the whole match and element k is
 * capture group k.  A group that did not take part in the match is
 * represented by an empty string, so positions stay stable.  At most
 * PMATCH_SIZE elements are returned.
 *
 * @param filter The filter to use when matching the string.
 * @param str The string to try to match with the regex in filter.
 * @return Returns a list_t* which is NULL if no match was made.
 * 		It is a valid pointer otherwise, which points to the matched
 *		sub strings in str.  NULL is also returned when filter or
 *		str is NULL, when regexec reports an error, or when memory
 *		for the sub strings cannot be allocated.
 */
list_t *
gutenfetch_filter_match(gutenfetch_filter_t *filter, const char *str)
{
	regmatch_t pmatch[PMATCH_SIZE];
	char *pieces[PMATCH_SIZE];
	list_t *subs = NULL;
	char *copy;
	size_t i, npieces, len;

	if (filter == NULL || filter->regex == NULL || str == NULL)
		return NULL;

	/* Any non-zero result (no match or an error) is reported as
	 * "no match". */
	if (regexec(filter->regex, str, PMATCH_SIZE, pmatch, 0) != 0)
		return NULL;

	if (filter->sub == FALSE) {
		/* Sub-matches were not requested: hand back the input. */
		copy = strdup(str);
		if (copy != NULL)
			subs = list_append(subs, copy);
		return subs;
	}

	/* regexec filled in at most PMATCH_SIZE entries; never read
	 * beyond them even if the pattern has more groups. */
	npieces = filter->regex->re_nsub + 1;
	if (npieces > PMATCH_SIZE)
		npieces = PMATCH_SIZE;

	/* Build every copy first so that an allocation failure can be
	 * undone completely before anything is put on the list. */
	for (i = 0; i < npieces; ++i) {
		/* rm_so is -1 for a group that did not participate. */
		if (pmatch[i].rm_so < 0 || pmatch[i].rm_eo < pmatch[i].rm_so)
			len = 0;
		else
			len = (size_t)(pmatch[i].rm_eo - pmatch[i].rm_so);

		pieces[i] = malloc(len + 1);
		if (pieces[i] == NULL) {
			while (i > 0)
				free(pieces[--i]);
			return NULL;
		}
		if (len > 0)
			memcpy(pieces[i], str + pmatch[i].rm_so, len);
		pieces[i][len] = '\0';
	}

	for (i = 0; i < npieces; ++i)
		subs = list_append(subs, pieces[i]);

	return subs;
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */