/*
** Builds the Knuth-Morris-Pratt shift table (often called "kmpNext") for
** needle, to be used later when sliding the search window over a text.
**
** @PARAM needle      pattern to preprocess. It is read up to and including
**                    index needle_len: the last iteration compares
**                    needle[needle_len], i.e. the terminating '\0' when
**                    needle_len is the real string length.
** @PARAM needle_len  must be the length of the string needle.
**
** The table has needle_len + 1 entries and entry 0 is always -1.
** Returns the malloc'ed table (to be released with free by the caller), or
** NULL if needle is NULL, needle_len is 0, or the allocation fails.
*/
static int	*create_kmp_array(char *needle, size_t needle_len)
{
	int		*next;
	int		border;
	size_t	pos;

	if (needle == NULL || needle_len == 0)
		return (NULL);
	next = malloc(sizeof(*next) * (needle_len + 1));
	if (next == NULL)
		return (NULL);
	next[0] = -1;
	border = -1;
	pos = 0;
	while (pos < needle_len)
	{
		/* Fall back through shorter borders until one can be extended. */
		while (border >= 0 && needle[pos] != needle[border])
			border = next[border];
		pos++;
		border++;
		/*
		** Default shift is the border length; when the characters that
		** follow are equal, a retry there would mismatch again, so reuse
		** the shift already stored for that border. border < pos always
		** holds, so next[border] is not affected by the line above it.
		*/
		next[pos] = border;
		if (needle[pos] == needle[border])
			next[pos] = next[border];
	}
	return (next);
}
/*
** Slides needle over haystack using the precomputed shift table kmp_next
** (as produced by create_kmp_array for this needle and needle_len).
**
** @PARAM haystack    NUL-terminated text to scan, must not be NULL.
** @PARAM needle      pattern searched for, must not be NULL.
** @PARAM kmp_next    shift table of needle, must not be NULL.
** @PARAM needle_len  length of needle, must be greater than 0.
**
** Returns the index in haystack where the first occurrence of needle starts,
** or -1 if the end of haystack is reached without a full match.
*/
static ssize_t	kmp_scan(char *haystack, char *needle, int *kmp_next,
					size_t needle_len)
{
	ssize_t	matched;
	size_t	pos;

	matched = 0;
	pos = 0;
	while (haystack[pos] != '\0')
	{
		/* On mismatch, shift the window without moving back in the text. */
		while (matched > -1 && needle[matched] != haystack[pos])
			matched = kmp_next[matched];
		pos++;
		matched++;
		/* matched grows by one per step, so here it equals needle_len. */
		if ((size_t)matched >= needle_len)
			return ((ssize_t)(pos - needle_len));
	}
	return (-1);
}

/*
** Searches haystack for needle with the Knuth-Morris-Pratt algorithm.
** Could be extended to report all occurrences of needle in haystack
** but for now only reports the first.
** (cf http://monge.univ-mlv.fr/~lecroq/string/node8.html#SECTION0080)
**
** @PARAM haystack  NUL-terminated text to search in.
** @PARAM needle    NUL-terminated word to look for.
**
** Returns:
**   - the index of the first occurrence of needle in haystack;
**   - 0 if needle is the empty string (it occurs at the very start);
**   - -1 if needle does not occur, or if haystack or needle is NULL;
**   - -2 only when the shift table could not be allocated.
*/
ssize_t	word_search_kmp(char *haystack, char *needle)
{
	size_t	needle_len;
	int		*kmp_next;
	ssize_t	found;

	if (haystack == NULL || needle == NULL)
		return (-1);
	needle_len = ft_strlen(needle);
	/*
	** create_kmp_array returns NULL for an empty needle; handle that case
	** here so it is not mistaken for an allocation failure below.
	*/
	if (needle_len == 0)
		return (0);
	kmp_next = create_kmp_array(needle, needle_len);
	if (kmp_next == NULL)
		return (-2);
	found = kmp_scan(haystack, needle, kmp_next, needle_len);
	free(kmp_next);
	return (found);
}
+** +** When shifting, it is reasonable to expect that a prefix v of the pattern +** matches some suffix of the portion u of the text. Moreover, if we want to +** avoid another immediate mismatch, the character following the prefix v in the +** pattern must be different from a. The longest such prefix v is called the +** tagged border of u (it occurs at both ends of u followed by different +** characters in x). +** +** This introduces the notation: let kmpNext[i] be the length of the longest +** border of x[0 .. i - 1] followed by a character c different from x[i] and -1 +** if no such tagged border exists, for 0 < i <= m. Then, after a shift, the +** comparisons can resume between characters x[kmpNext[i]] and y[i+j] without +** missing any occurrence of x in y, and avoiding a backtrack on the text +** (see figure 7.1). The value of kmpNext[0] is set to -1. +** The table kmpNext can be computed in O(m) space and time before the searching +** phase, applying the same searching algorithm to the pattern itself, as if +** x = y. +** +** The searching phase can be performed in O(m + n) time. The Knuth-Morris-Pratt +** algorithm performs at most 2 * n - 1 text character comparisons during the +** searching phase. The delay (maximal number of comparisons for a single text +** character) is bounded by log_Phi(m) where Phi is the golden ratio. +*/ + +/* +** Uses the Knuth-Morris-Pratt algorithm. +** Returns the index where the first occurrence of needle was found in haystack, or +** -1 if no such occurrence was found and -2 in case of allocation error. +*/ +ssize_t word_search_kmp(char *haystack, char *needle); + +#endif