/***********************************************************************
    gutenfetch - query and fetch electronic texts from project gutenberg
    Copyright (C) 2001, 2002, 2003, 2004 Russell Francis

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

    Last updated on $Date: 2004/03/21 05:14:23 $ by $Author: johntabularasa $.
***********************************************************************/
#include "stddefs.h"
#include "libgutenfetch_filter.h"

/* Each system header is pulled in only when configure detected it; the
 * header name corresponds to the HAVE_<NAME>_H guard around it. */
#ifdef HAVE_STDIO_H
# include <stdio.h>
#endif
#ifdef HAVE_STDLIB_H
# include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
# include <string.h>
#endif
#ifdef HAVE_STRINGS_H
# include <strings.h>
#endif
#ifdef HAVE_ASSERT_H
# include <assert.h>
#endif

/* * * * * * * * R E G U L A R   E X P R E S S I O N S * * * * * */

/*
 * All patterns below are POSIX extended regular expressions.  In the
 * list returned by gutenfetch_filter_match(), element 0 is the whole
 * match and element N is parenthesised sub-expression N, counted by
 * opening parenthesis from left to right.
 */

/* This is a regular expression that matches a line in GUTINDEX.ALL
 * that refers to a particular resource available on the server.
 *
 * Sub-expressions:
 *   1  month abbreviation (Jan .. Dec)
 *   2  last two digits of the four digit year
 *   3  the 50 characters following the date
 *   4  the bracketed 8.3 file name
 *   5  the run of digits following the file name
 *   6  trailing flag: "A", "C", "*" or empty
 *   7  always empty (the "()" alternative inside 6)
 */
#define FILTER_GUTINDEX_OLD_ENTRY "^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[[:space:]][[:digit:]]{2}([[:digit:]]{2})[[:space:]](.{50})\\[(.{8}\\.[[:alnum:]]{3})\\][[:space:]]+([[:digit:]]+)(A|C|\\*|())"

/* This is a regex that matches a line in GUTINDEX.ALL
 * that refers to a particular resource > 10000 on the server.
 * The pattern requires exactly five digits after the first 73
 * characters of the line.
 *
 * Sub-expressions:
 *   1  the first 73 characters of the line
 *   2  the five digit number
 *   3  trailing flag: "A", "C", "*" or empty
 *   4  always empty (the "()" alternative inside 3)
 */
#define FILTER_GUTINDEX_NEW_ENTRY "^(.{73})([[:digit:]]{5})(A|C|\\*|())"

/* This is a first stab at extracting an author's name from the listing.
 * Sub-expression 1 is the text following ", by": one or more runs of
 * letters, '.', '(' or ')' each optionally followed by a single
 * whitespace character.
 */
#define FILTER_AUTHOR ",[[:space:]]?by[[:space:]]+((([[:alpha:]]|\\.|\\(|\\))+([[:space:]]|()))+)"

/* This is a first stab at extracting the title from a listing.
 * NEW: sub-expression 1 is everything before ", by ".
 * OLD: sub-expression 1 is the month, sub-expression 2 is the text
 *      between the "Mon YYYY " date prefix and ", by ".
 */
#define FILTER_NEW_TITLE "(.+),[[:space:]]?by[[:space:]]"
#define FILTER_OLD_TITLE "^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[[:space:]][[:digit:]]{4}[[:space:]](.+),[[:space:]]?by[[:space:]]"

/*
 * These filters match the ls-lR and ls-R file respectively and allow
 * us to glean detailed information about an etext entry such as its
 * size, and the formats it is available in.
 *
 * This should match either the old directories "etext99" or
 * new-style directories "1/0/0/0/10000".
 *
 * Sub-expressions:
 *   1  the directory, whichever form matched
 *   2  the directory if it is new-style, otherwise empty
 *   3  the directory if it is old-style, otherwise empty
 */
#define FILTER_LS_LR_DETAIL_DIRECTORY "^\\./(([[:digit:]]/[[:digit:]]/[[:digit:]]/[[:digit:]]/[[:digit:]]{5})|(etext[[:digit:]][[:digit:]])):"

/* These should be the same. */
#define FILTER_LS_R_DETAIL_DIRECTORY FILTER_LS_LR_DETAIL_DIRECTORY

/* A file line in the ls-lR listing.
 *
 * Sub-expressions:
 *   1  the run of digits directly before the month
 *   2  month abbreviation
 *   3  four digit year or HH:MM time
 *   4  file name: 1-8 characters, a dot, 1-3 characters
 */
#define FILTER_LS_LR_DETAIL_ENTRY "([[:digit:]]+)[[:space:]](Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[[:space:]]{1,2}[[:digit:]]{1,2}[[:space:]]{1,2}([[:digit:]]{4}|[[:digit:]]{2}:[[:digit:]]{2})[[:space:]](.{1,8}\\..{1,3})"
/* Disabled alternative form of the pattern above, kept for reference: */
//#define FILTER_LS_LR_DETAIL_ENTRY "^-(.)+([[:digit:]])+[[:space:]](Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)([[:space:]]){1,2}([[:digit:]]){1,2}([[:space:]]){1,2}(([[:digit:]]{4})|([[:digit:]]{2}:[[:digit:]]{2}))[[:space:]]((.{1,8})\\.(.{1,3}))"

/* On match ...
 * basename will be array[1]
 * extension will be array[2]
 */
#define FILTER_FILENAME_BASE_EXT "^(.+)\\.(.{1,3})"
#define FILTER_LS_R_DETAIL_ENTRY FILTER_FILENAME_BASE_EXT

/* * * * * E N D   O F   R E G U L A R   E X P R E S S I O N S * * * * */

// The maximum number of matches to find in a regex.
#define PMATCH_SIZE 32 static gutenfetch_filter_t *ifilter[NUM_OF_IFILTER]; static char *filter_regex[] = { FILTER_GUTINDEX_OLD_ENTRY, FILTER_GUTINDEX_NEW_ENTRY, FILTER_AUTHOR, FILTER_OLD_TITLE, FILTER_NEW_TITLE, FILTER_LS_LR_DETAIL_DIRECTORY, FILTER_LS_LR_DETAIL_ENTRY, FILTER_LS_R_DETAIL_DIRECTORY, FILTER_LS_R_DETAIL_ENTRY, FILTER_FILENAME_BASE_EXT }; /** * gutenfetch_filter_init * * Initialize the filter routines. * * @return Returns GUTENFETCH_OK on success. */ gutenfetch_error_t gutenfetch_filter_init(void) { unsigned int i; for (i = 0;i < NUM_OF_IFILTER; ++i) { ifilter[i] = gutenfetch_filter_create(filter_regex[i], TRUE); } return GUTENFETCH_OK; } /** * gutenfetch_filter_shutdown * * Free any resources held by the filter routines. * * @return Returns GUTENFETCH_OK on success. */ gutenfetch_error_t gutenfetch_filter_shutdown(void) { unsigned int i; for (i = 0;i < NUM_OF_IFILTER; ++i) { gutenfetch_filter_destroy(ifilter[i]); } return GUTENFETCH_OK; } /** * gutenfetch_filter_create - * Create a filter based on regular expressions. * * @param pattern A regular expression which the * filter will match. * @param sub TRUE if we want to get the values of * the expression that match the regex, FALSE * if we are only interested in getting a TRUE/FALSE * response when we call filter_match. * @return A pointer to the filter we have created or * NULL if something went wrong. */ gutenfetch_filter_t * gutenfetch_filter_create(const char *pattern, int sub) { int ret; unsigned int flags = REG_EXTENDED; gutenfetch_filter_t *filter; filter = malloc(sizeof(gutenfetch_filter_t)); filter->regex = malloc(sizeof(regex_t)); filter->sub = sub; if (sub == FALSE) flags |= REG_NOSUB; if( (ret = regcomp(filter->regex, pattern, flags)) != 0) { gutenfetch_filter_destroy(filter); filter = NULL; } return filter; } /** * Release the resources used by a filter. * * @param filter The filter we wish to free. 
*/ void gutenfetch_filter_destroy(gutenfetch_filter_t *filter) { regfree(filter->regex); FREE_NULL(filter->regex); FREE_NULL(filter); } /** * gutenfetch_ifilter_match * * Determine if a string matches a predefined regular expression. * * @param ifilter_index The unique index of the predefined regular * expression. * @param str The string to try to match with the filter. * @return Returns a list_t* which is NULL if no match was * made. It is a valid pointer otherwise, which points to the * matched sub strings in the string. */ list_t * gutenfetch_ifilter_match(int ifilter_index, const char *str) { return gutenfetch_filter_match(ifilter[ifilter_index], str); } /** * Determine if a string matches a regular expression. * * @param filter The filter to use when matching the string. * @param str The string to try to match with the regex in filter. * @return Returns a list_t* which is NULL if no match was made. * It is a valid pointer otherwise, which points to the matched * sub strings in str. */ list_t * gutenfetch_filter_match(gutenfetch_filter_t *filter, const char *str) { regmatch_t pmatch[PMATCH_SIZE]; list_t *subs = NULL; size_t sub_string_size; char *sub_string; int i, ret, nsubs; ret = regexec(filter->regex, str, PMATCH_SIZE, pmatch, 0); if (ret == 0) { // match if (filter->sub == TRUE) { // get internal matches nsubs = filter->regex->re_nsub + 1; for (i = 0;i < nsubs; ++i) { sub_string_size = pmatch[i].rm_eo - pmatch[i].rm_so; sub_string = malloc((sizeof(char) * sub_string_size) + 1); memcpy(sub_string, &str[pmatch[i].rm_so], sub_string_size); sub_string[sub_string_size] = '\0'; subs = list_append(subs, sub_string); sub_string = NULL; } } else { sub_string = strdup(str); if (sub_string != NULL) subs = list_append(subs, sub_string); } } /*else if (ret != REG_NOMATCH) { // We have an error condition } */ return subs; }