Expansion: Added search for words in text

An implementation of the Knuth, Morris and Pratt algorithm for finding
the first occurrence of a word in a text.
This commit is contained in:
Jérôme Guélen 2025-03-02 17:22:11 +01:00
parent fa383c4f17
commit 768f1b3cb6
No known key found for this signature in database
2 changed files with 148 additions and 0 deletions

View file

@ -0,0 +1,85 @@
/* ************************************************************************** */
/* */
/* ::: :::::::: */
/* word_search.c :+: :+: :+: */
/* +:+ +:+ +:+ */
/* By: jguelen <marvin@42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */
/* Created: 2025/03/02 14:27:58 by jguelen #+# #+# */
/* Updated: 2025/03/02 17:16:39 by jguelen ### ########.fr */
/* */
/* ************************************************************************** */
#include "word_search.h"
/*
** Builds the Knuth-Morris-Pratt shift table for the word needle; the table
** tells the search by how much the window can be shifted after a mismatch.
** @PARAM needle      the word to preprocess. The byte needle[needle_len] is
**                    read as well, so it has to be readable (for a C string
**                    it is the terminating '\0').
** @PARAM needle_len  must be the length of the string needle.
** The table holds needle_len + 1 entries and its first entry is always -1.
** Returns the newly allocated table (to be freed by the caller), or NULL if
** needle is NULL, needle_len is 0 or the allocation fails.
*/
static int	*create_kmp_array(char *needle, size_t needle_len)
{
	int		*table;
	size_t	pos;
	int		border;

	if (needle == NULL || needle_len == 0)
		return (NULL);
	table = malloc(sizeof(int) * (needle_len + 1));
	if (table == NULL)
		return (NULL);
	table[0] = -1;
	border = -1;
	pos = 0;
	while (pos < needle_len)
	{
		while (border >= 0 && needle[pos] != needle[border])
			border = table[border];
		pos++;
		border++;
		if (needle[pos] != needle[border])
			table[pos] = border;
		else
			table[pos] = table[border];
	}
	return (table);
}
/*
** Scans haystack for needle with the help of the precomputed shift table kmp
** (cf http://monge.univ-mlv.fr/~lecroq/string/node8.html#SECTION0080).
** i is the number of characters of needle currently matched (-1 meaning that
** the window has to move past haystack[j]); j is the position in haystack.
** The scan stops at the terminating '\0' of haystack, so haystack is never
** measured beforehand.
** Returns the index where the first occurrence starts, or -1 if there is none.
*/
static ssize_t	kmp_scan(char *haystack, char *needle, int *kmp,
					size_t needle_len)
{
	ssize_t	i;
	size_t	j;

	i = 0;
	j = 0;
	while (haystack[j])
	{
		while (i > -1 && needle[i] != haystack[j])
			i = kmp[i];
		i++;
		j++;
		if ((size_t)i >= needle_len)
			return ((ssize_t)j - i);
	}
	return (-1);
}

/*
** Finds the first occurrence of needle in haystack using the
** Knuth-Morris-Pratt algorithm.
** Could be extended to report all occurrences of needle in haystack
** but for now only reports the first.
** @PARAM haystack  the text to search in (NUL-terminated string).
** @PARAM needle    the word to look for (NUL-terminated string).
** Returns:
**   the index in haystack where the first occurrence of needle starts;
**   0 if needle is the empty string (it matches at the very beginning);
**   -1 if needle does not occur in haystack, or if either argument is NULL;
**   -2 if the shift table could not be allocated.
** Positions in haystack are kept in size_t / ssize_t rather than int so that
** a very long haystack does not overflow the index.
*/
ssize_t	word_search_kmp(char *haystack, char *needle)
{
	int		*kmp;
	size_t	needle_len;
	ssize_t	index;

	if (!haystack || !needle)
		return (-1);
	needle_len = ft_strlen(needle);
	if (needle_len == 0)
		return (0);
	kmp = create_kmp_array(needle, needle_len);
	if (!kmp)
		return (-2);
	index = kmp_scan(haystack, needle, kmp, needle_len);
	free(kmp);
	return (index);
}

View file

@ -0,0 +1,63 @@
/* ************************************************************************** */
/* */
/* ::: :::::::: */
/* word_search.h :+: :+: :+: */
/* +:+ +:+ +:+ */
/* By: jguelen <marvin@42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */
/* Created: 2025/03/02 14:10:36 by jguelen #+# #+# */
/* Updated: 2025/03/02 14:30:16 by jguelen ### ########.fr */
/* */
/* ************************************************************************** */
#ifndef WORD_SEARCH_H
# define WORD_SEARCH_H
# include <stdlib.h>
# include "libft.h"
/*
** An implementation of the Knuth, Morris and Pratt algorithm for exact word
** searching in a text.
** cf. http://monge.univ-mlv.fr/~lecroq/string/node8.html#SECTION0080
**
** The design of the Knuth-Morris-Pratt algorithm follows a tight analysis of
** the Morris and Pratt algorithm. Let us look more closely at the Morris-Pratt
** algorithm. It is possible to improve the length of the shifts.
**
** Consider an attempt at a left position j, that is when the window is
** positioned on the text factor y[j .. j + m - 1]. Assume that the first
** mismatch occurs between x[i] and y[i+j] with 0 < i < m. Then,
** x[0 .. i - 1] = y[j .. i + j - 1] = u and a = x[i] != y[i + j] = b.
**
** When shifting, it is reasonable to expect that a prefix v of the pattern
** matches some suffix of the portion u of the text. Moreover, if we want to
** avoid another immediate mismatch, the character following the prefix v in the
** pattern must be different from a. The longest such prefix v is called the
** tagged border of u (it occurs at both ends of u followed by different
** characters in x).
**
** This introduces the notation: let kmpNext[i] be the length of the longest
** border of x[0 .. i - 1] followed by a character c different from x[i] and -1
** if no such tagged border exists, for 0 < i <= m. Then, after a shift, the
** comparisons can resume between characters x[kmpNext[i]] and y[i+j] without
** missing any occurrence of x in y, and avoiding a backtrack on the text
** (see figure 7.1). The value of kmpNext[0] is set to -1.
** The table kmpNext can be computed in O(m) space and time before the searching
** phase, applying the same searching algorithm to the pattern itself, as if
** x = y.
**
** The searching phase can be performed in O(m + n) time. The Knuth-Morris-Pratt
** algorithm performs at most 2 * n - 1 text character comparisons during the
** searching phase. The delay (maximal number of comparisons for a single text
** character) is bounded by log_Phi(m) where Phi is the golden ratio.
*/
/*
** Searches haystack for needle with the Knuth-Morris-Pratt algorithm.
** Both arguments are expected to be non NULL, non empty, NUL-terminated
** strings.
** Returns the index in haystack where the first occurrence of needle starts,
** -1 if needle does not occur in haystack, and -2 if the internal shift table
** could not be allocated.
** NOTE(review): ssize_t is declared by <sys/types.h>; presumably libft.h
** brings it into scope - confirm, or include it here directly.
*/
ssize_t word_search_kmp(char *haystack, char *needle);
#endif