/***********************************************************************
gutenfetch - query and fetch electronic texts from project gutenberg
Copyright (C) 2001, 2002, 2003, 2004 Russell Francis
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA
Last updated on $Date: 2004/03/21 05:14:23 $ by $Author: johntabularasa $.
***********************************************************************/
#include "stddefs.h"
#include "libgutenfetch_filter.h"
#ifdef HAVE_STDIO_H
# include <stdio.h>
#endif
#ifdef HAVE_STDLIB_H
# include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
# include <string.h>
#endif
#ifdef HAVE_STRINGS_H
# include <strings.h>
#endif
#ifdef HAVE_ASSERT_H
# include <assert.h>
#endif
/* * * * * * * *  REGULAR EXPRESSIONS  * * * * * * * */
/*
 * All patterns below are POSIX extended regular expressions; they are
 * collected in filter_regex[] further down in this file.
 */
/* Matches a line in GUTINDEX.ALL that refers to a particular resource
 * available on the server (old-style entry).
 *
 * Sub-expressions:
 *   1  month abbreviation
 *   2  last two digits of the four digit year
 *   3  the 50 characters following the date
 *   4  file name in the form 8 characters, '.', 3 alphanumerics
 *   5  run of digits following the bracketed file name
 *   6  trailing marker: "A", "C", "*" or nothing
 *   7  empty alternative inside sub-expression 6
 */
#define FILTER_GUTINDEX_OLD_ENTRY "^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[[:space:]][[:digit:]]{2}([[:digit:]]{2})[[:space:]](.{50})\\[(.{8}\\.[[:alnum:]]{3})\\][[:space:]]+([[:digit:]]+)(A|C|\\*|())"
/* Matches a line in GUTINDEX.ALL that refers to a particular resource
 * numbered 10000 or above on the server (new-style entry).
 *
 * Sub-expressions:
 *   1  the first 73 characters of the line
 *   2  the five digit number
 *   3  trailing marker: "A", "C", "*" or nothing
 *   4  empty alternative inside sub-expression 3
 */
#define FILTER_GUTINDEX_NEW_ENTRY "^(.{73})([[:digit:]]{5})(A|C|\\*|())"
/* A first stab at extracting an author's name from the listing.
 * Sub-expression 1 is the run of words (letters, '.', '(' and ')')
 * that follows ", by ".
 */
#define FILTER_AUTHOR ",[[:space:]]?by[[:space:]]+((([[:alpha:]]|\\.|\\(|\\))+([[:space:]]|()))+)"
/* A first stab at extracting the title from a listing.
 * NEW_TITLE: sub-expression 1 is everything before ", by ".
 * OLD_TITLE: sub-expression 1 is the month, sub-expression 2 is the
 *            text between the leading date and ", by ".
 */
#define FILTER_NEW_TITLE "(.+),[[:space:]]?by[[:space:]]"
#define FILTER_OLD_TITLE "^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[[:space:]][[:digit:]]{4}[[:space:]](.+),[[:space:]]?by[[:space:]]"
/*
 * These filters match the ls-lR and ls-R files respectively and allow
 * us to glean detailed information about an etext entry such as its
 * size, and the formats it is available in.
 *
 * The directory filter matches either the old directories "etext99" or
 * the new-style directories "1/0/0/0/10000".  Sub-expression 1 holds
 * whichever form matched; sub-expression 2 is the new-style path and
 * sub-expression 3 is the old-style name, so exactly one of the two
 * takes part in any given match.
 */
#define FILTER_LS_LR_DETAIL_DIRECTORY "^\\./(([[:digit:]]/[[:digit:]]/[[:digit:]]/[[:digit:]]/[[:digit:]]{5})|(etext[[:digit:]][[:digit:]])):"
/* The ls-R directory lines use the same pattern as the ls-lR ones. */
#define FILTER_LS_R_DETAIL_DIRECTORY FILTER_LS_LR_DETAIL_DIRECTORY
/* Matches a file line of the ls-lR listing (not anchored to the start
 * of the line).
 *
 * Sub-expressions:
 *   1  run of digits immediately before the month
 *   2  month abbreviation
 *   3  four digit year, or a time in the form hh:mm
 *   4  file name: 1 to 8 characters, '.', 1 to 3 characters
 */
#define FILTER_LS_LR_DETAIL_ENTRY "([[:digit:]]+)[[:space:]](Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[[:space:]]{1,2}[[:digit:]]{1,2}[[:space:]]{1,2}([[:digit:]]{4}|[[:digit:]]{2}:[[:digit:]]{2})[[:space:]](.{1,8}\\..{1,3})"
/* Earlier, anchored variant of the pattern above; disabled. */
//#define FILTER_LS_LR_DETAIL_ENTRY "^-(.)+([[:digit:]])+[[:space:]](Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)([[:space:]]){1,2}([[:digit:]]){1,2}([[:space:]]){1,2}(([[:digit:]]{4})|([[:digit:]]{2}:[[:digit:]]{2}))[[:space:]]((.{1,8})\\.(.{1,3}))"
/* Splits a file name into base name and extension.  On match ...
 *   basename will be array[1]
 *   extension will be array[2] (1 to 3 characters after the last '.'
 *   that still leaves such an extension)
 */
#define FILTER_FILENAME_BASE_EXT "^(.+)\\.(.{1,3})"
/* An ls-R entry is just a file name, so it reuses the pattern above. */
#define FILTER_LS_R_DETAIL_ENTRY FILTER_FILENAME_BASE_EXT
/* * * * *  END OF REGULAR EXPRESSIONS  * * * * */
// The maximum number of matches to find in a regex: the size of the
// regmatch_t array handed to regexec() (whole match + sub-expressions).
#define PMATCH_SIZE 32
/* Compiled built-in filters, filled in by gutenfetch_filter_init and
 * indexed by the ifilter_index argument of gutenfetch_ifilter_match.
 */
static gutenfetch_filter_t *ifilter[NUM_OF_IFILTER];

/* Source patterns for the built-in filters; entry i is compiled into
 * ifilter[i].  The strings are literals and are never modified, hence
 * the const qualification.
 * NOTE(review): the order presumably has to follow the index constants
 * that go with NUM_OF_IFILTER, and the table needs at least
 * NUM_OF_IFILTER entries -- confirm against the header.
 */
static const char *const filter_regex[] = {
    FILTER_GUTINDEX_OLD_ENTRY,
    FILTER_GUTINDEX_NEW_ENTRY,
    FILTER_AUTHOR,
    FILTER_OLD_TITLE,
    FILTER_NEW_TITLE,
    FILTER_LS_LR_DETAIL_DIRECTORY,
    FILTER_LS_LR_DETAIL_ENTRY,
    FILTER_LS_R_DETAIL_DIRECTORY,
    FILTER_LS_R_DETAIL_ENTRY,
    FILTER_FILENAME_BASE_EXT
};
/**
 * gutenfetch_filter_init
 *
 * Compile every built-in pattern in filter_regex[] and store the
 * resulting filter in the matching slot of ifilter[].  Each filter is
 * created with sub-expression reporting enabled.
 *
 * NOTE(review): the result of gutenfetch_filter_create is stored
 * without being checked, so a pattern that fails to compile leaves a
 * NULL slot and this function still reports success.
 *
 * @return Always returns GUTENFETCH_OK.
 */
gutenfetch_error_t
gutenfetch_filter_init(void)
{
    unsigned int slot = 0;

    while (slot < NUM_OF_IFILTER) {
        ifilter[slot] = gutenfetch_filter_create(filter_regex[slot], TRUE);
        slot++;
    }

    return GUTENFETCH_OK;
}
/**
* gutenfetch_filter_shutdown
*
* Free any resources held by the filter routines.
*
* @return Returns GUTENFETCH_OK on success.
*/
gutenfetch_error_t
gutenfetch_filter_shutdown(void)
{
unsigned int i;
for (i = 0;i < NUM_OF_IFILTER; ++i) {
gutenfetch_filter_destroy(ifilter[i]);
}
return GUTENFETCH_OK;
}
/**
 * gutenfetch_filter_create -
 *  Create a filter based on a POSIX extended regular expression.
 *
 * @param pattern The regular expression which the filter will match.
 * @param sub FALSE if we are only interested in a match / no-match
 *     answer when we call filter_match (the expression is then
 *     compiled with REG_NOSUB); any other value if we want the values
 *     of the sub-expressions that matched.
 * @return A pointer to the filter we have created, to be released with
 *     gutenfetch_filter_destroy, or NULL if pattern is NULL, memory
 *     could not be allocated or the pattern could not be compiled.
 */
gutenfetch_filter_t *
gutenfetch_filter_create(const char *pattern, int sub)
{
    int flags = REG_EXTENDED;
    gutenfetch_filter_t *filter;

    if (pattern == NULL)
        return NULL;

    filter = malloc(sizeof *filter);
    if (filter == NULL)
        return NULL;

    filter->regex = malloc(sizeof(regex_t));
    if (filter->regex == NULL) {
        free(filter);
        return NULL;
    }

    filter->sub = sub;
    if (sub == FALSE)
        flags |= REG_NOSUB;

    if (regcomp(filter->regex, pattern, flags) != 0) {
        /* After a failed regcomp() the regex_t holds nothing that
         * regfree() may be called on, so release only our own
         * allocations instead of going through
         * gutenfetch_filter_destroy. */
        free(filter->regex);
        free(filter);
        return NULL;
    }

    return filter;
}
/**
 * Release the resources used by a filter.
 *
 * The compiled expression is released with regfree() before the memory
 * holding it, and then the filter structure itself, is freed.  The
 * pointer is passed by value, so the caller's copy is left unchanged
 * and must not be used again after this call.
 *
 * @param filter The filter we wish to free.  NULL is accepted and
 *     ignored, as is a filter whose regex member is NULL.
 */
void
gutenfetch_filter_destroy(gutenfetch_filter_t *filter)
{
    if (filter == NULL)
        return;

    if (filter->regex != NULL) {
        regfree(filter->regex);
        FREE_NULL(filter->regex);
    }
    FREE_NULL(filter);
}
/**
* gutenfetch_ifilter_match
*
* Determine if a string matches a predefined regular expression.
*
* @param ifilter_index The unique index of the predefined regular
* expression.
* @param str The string to try to match with the filter.
* @return Returns a list_t* which is NULL if no match was
* made. It is a valid pointer otherwise, which points to the
* matched sub strings in the string.
*/
list_t *
gutenfetch_ifilter_match(int ifilter_index, const char *str)
{
return gutenfetch_filter_match(ifilter[ifilter_index], str);
}
/**
 * Determine if a string matches a regular expression.
 *
 * For a filter created without sub-expression reporting
 * (sub == FALSE) the returned list holds a single heap-allocated copy
 * of str.  Otherwise it holds one heap-allocated string per regmatch
 * slot: element 0 is the text of the whole match and element k the
 * text of sub-expression k.  A sub-expression that took no part in the
 * match is reported as an empty string so that later elements keep
 * their positions.  At most PMATCH_SIZE elements are reported.
 *
 * @param filter The filter to use when matching the string.
 * @param str The string to try to match with the regex in filter.
 * @return Returns a list_t* which is NULL if no match was made.
 *     It is a valid pointer otherwise, which points to copies of the
 *     matched sub strings in str.  NULL is also returned if filter or
 *     str is NULL, if regexec() reports an error, or if memory for the
 *     sub strings could not be allocated.
 */
list_t *
gutenfetch_filter_match(gutenfetch_filter_t *filter, const char *str)
{
    regmatch_t pmatch[PMATCH_SIZE];
    char *pieces[PMATCH_SIZE];
    list_t *subs = NULL;
    const char *start;
    char *copy;
    size_t len, count, i;

    if (filter == NULL || filter->regex == NULL || str == NULL)
        return NULL;

    if (regexec(filter->regex, str, PMATCH_SIZE, pmatch, 0) != 0)
        return NULL;

    /* Same test gutenfetch_filter_create uses to pick REG_NOSUB, so the
     * two functions always agree on whether pmatch[] was filled in. */
    if (filter->sub == FALSE) {
        copy = strdup(str);
        if (copy != NULL)
            subs = list_append(subs, copy);
        return subs;
    }

    /* regexec() filled in at most PMATCH_SIZE slots; never read more. */
    count = filter->regex->re_nsub + 1;
    if (count > PMATCH_SIZE)
        count = PMATCH_SIZE;

    /* Build every copy before touching the list so that an allocation
     * failure can be undone completely. */
    for (i = 0; i < count; ++i) {
        if (pmatch[i].rm_so < 0 || pmatch[i].rm_eo < pmatch[i].rm_so) {
            /* Sub-expression did not participate (offsets are -1). */
            start = str;
            len = 0;
        } else {
            start = str + pmatch[i].rm_so;
            len = (size_t)(pmatch[i].rm_eo - pmatch[i].rm_so);
        }

        pieces[i] = malloc(len + 1);
        if (pieces[i] == NULL) {
            while (i > 0)
                free(pieces[--i]);
            return NULL;
        }
        if (len > 0)
            memcpy(pieces[i], start, len);
        pieces[i][len] = '\0';
    }

    for (i = 0; i < count; ++i)
        subs = list_append(subs, pieces[i]);

    return subs;
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */