mirror of
https://codeberg.org/la-chouette/minishell.git
synced 2025-12-06 07:28:09 +01:00
Expansion: Added research of words in text
An implementation of the Knuth, Morris and Pratt algorithm for finding the first occurrence of a word in a text.
This commit is contained in:
parent
fa383c4f17
commit
768f1b3cb6
2 changed files with 148 additions and 0 deletions
85
src/word_search/word_search.c
Normal file
85
src/word_search/word_search.c
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
/* ************************************************************************** */
|
||||
/* */
|
||||
/* ::: :::::::: */
|
||||
/* word_search.c :+: :+: :+: */
|
||||
/* +:+ +:+ +:+ */
|
||||
/* By: jguelen <marvin@42.fr> +#+ +:+ +#+ */
|
||||
/* +#+#+#+#+#+ +#+ */
|
||||
/* Created: 2025/03/02 14:27:58 by jguelen #+# #+# */
|
||||
/* Updated: 2025/03/02 17:16:39 by jguelen ### ########.fr */
|
||||
/* */
|
||||
/* ************************************************************************** */
|
||||
|
||||
#include "word_search.h"
|
||||
|
||||
/*
** Builds the Knuth-Morris-Pratt shift table of the word needle.
**
** @PARAM needle the word the table is built for. needle[needle_len] is read
** by the last iteration, so the terminating NUL must be present.
** @PARAM needle_len must be the length of the string needle.
**
** The table holds needle_len + 1 entries and its first entry is always -1.
** When the search mismatches on needle[i], it resumes the comparison at
** needle[table[i]]; an entry of -1 means that no prefix can be reused.
**
** Returns the table, which the caller has to free, or NULL if needle is NULL,
** needle_len is 0 or the allocation fails.
*/
static int	*create_kmp_array(char *needle, size_t needle_len)
{
	int		*table;
	size_t	pos;
	int		border;

	if (needle == NULL || needle_len == 0)
		return (NULL);
	table = malloc(sizeof(*table) * (needle_len + 1));
	if (table == NULL)
		return (NULL);
	table[0] = -1;
	border = -1;
	pos = 0;
	while (pos < needle_len)
	{
		/* Fall back through shorter borders until one can be extended. */
		while (border >= 0 && needle[pos] != needle[border])
			border = table[border];
		++pos;
		++border;
		/* Same next character: the fallback would mismatch too, skip it. */
		if (needle[pos] != needle[border])
			table[pos] = border;
		else
			table[pos] = table[border];
	}
	return (table);
}
|
||||
|
||||
/*
** Finds the first occurrence of needle in haystack with the
** Knuth-Morris-Pratt algorithm.
** (cf http://monge.univ-mlv.fr/~lecroq/string/node8.html#SECTION0080)
**
** @PARAM haystack the NUL-terminated text to search in.
** @PARAM needle the NUL-terminated word to look for.
**
** Both arguments are expected to be non NULL and needle is expected to be
** non empty. A NULL argument or an empty needle is rejected before anything
** is measured or allocated and is reported as "not found", so that -2 is
** only ever returned for a real allocation failure.
**
** Returns the index in haystack of the first occurrence of needle, -1 if
** there is none (or if the arguments are invalid) and -2 if the KMP array
** could not be allocated.
** Could be extended to report all occurrences of needle in haystack
** but for now only reports the first.
*/
ssize_t	word_search_kmp(char *haystack, char *needle)
{
	ssize_t	i;
	ssize_t	j;
	int		*kmp;
	ssize_t	needle_len;
	ssize_t	haystack_len;

	if (!haystack || !needle || !needle[0])
		return (-1);
	/* Lengths stay in ssize_t so they are not narrowed to int. */
	needle_len = ft_strlen(needle);
	haystack_len = ft_strlen(haystack);
	kmp = create_kmp_array(needle, needle_len);
	if (!kmp)
		return (-2);
	i = 0;
	j = 0;
	while (j < haystack_len)
	{
		/* On a mismatch, reuse the longest usable prefix of needle. */
		while (i > -1 && needle[i] != haystack[j])
			i = kmp[i];
		j++;
		if (++i >= needle_len)
		{
			/* j is one past the match end, i is the matched length. */
			free(kmp);
			return (j - i);
		}
	}
	free(kmp);
	return (-1);
}
|
||||
63
src/word_search/word_search.h
Normal file
63
src/word_search/word_search.h
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
/* ************************************************************************** */
|
||||
/* */
|
||||
/* ::: :::::::: */
|
||||
/* word_search.h :+: :+: :+: */
|
||||
/* +:+ +:+ +:+ */
|
||||
/* By: jguelen <marvin@42.fr> +#+ +:+ +#+ */
|
||||
/* +#+#+#+#+#+ +#+ */
|
||||
/* Created: 2025/03/02 14:10:36 by jguelen #+# #+# */
|
||||
/* Updated: 2025/03/02 14:30:16 by jguelen ### ########.fr */
|
||||
/* */
|
||||
/* ************************************************************************** */
|
||||
|
||||
#ifndef WORD_SEARCH_H
|
||||
# define WORD_SEARCH_H
|
||||
|
||||
# include <stdlib.h>
|
||||
# include "libft.h"
|
||||
|
||||
/*
|
||||
** An implementation of the Knuth, Morris and Pratt algorithm for exact word
|
||||
** searching in a text.
|
||||
** cf. http://monge.univ-mlv.fr/~lecroq/string/node8.html#SECTION0080
|
||||
**
|
||||
** The design of the Knuth-Morris-Pratt algorithm follows a tight analysis of
|
||||
** the Morris and Pratt algorithm. Let us look more closely at the Morris-Pratt
|
||||
** algorithm. It is possible to improve the length of the shifts.
|
||||
**
|
||||
** Consider an attempt at a left position j, that is when the window is
|
||||
** positioned on the text factor y[j .. j + m - 1]. Assume that the first
|
||||
** mismatch occurs between x[i] and y[i+j] with 0 < i < m. Then,
|
||||
** x[0 .. i - 1] = y[j .. i + j - 1] = u and a = x[i] != y[i + j] = b.
|
||||
**
|
||||
** When shifting, it is reasonable to expect that a prefix v of the pattern
|
||||
** matches some suffix of the portion u of the text. Moreover, if we want to
|
||||
** avoid another immediate mismatch, the character following the prefix v in the
|
||||
** pattern must be different from a. The longest such prefix v is called the
|
||||
** tagged border of u (it occurs at both ends of u followed by different
|
||||
** characters in x).
|
||||
**
|
||||
** This introduces the notation: let kmpNext[i] be the length of the longest
|
||||
** border of x[0 .. i - 1] followed by a character c different from x[i] and -1
|
||||
** if no such tagged border exists, for 0 < i <= m. Then, after a shift, the
|
||||
** comparisons can resume between characters x[kmpNext[i]] and y[i+j] without
|
||||
** missing any occurrence of x in y, and avoiding a backtrack on the text
|
||||
** (see figure 7.1). The value of kmpNext[0] is set to -1.
|
||||
** The table kmpNext can be computed in O(m) space and time before the searching
|
||||
** phase, applying the same searching algorithm to the pattern itself, as if
|
||||
** x = y.
|
||||
**
|
||||
** The searching phase can be performed in O(m + n) time. The Knuth-Morris-Pratt
|
||||
** algorithm performs at most 2 * n - 1 text character comparisons during the
|
||||
** searching phase. The delay (maximal number of comparisons for a single text
|
||||
** character) is bounded by log_Phi(m) where Phi is the golden ratio.
|
||||
*/
|
||||
|
||||
/*
** Searches haystack for needle using the Knuth-Morris-Pratt algorithm.
** Both arguments are expected to be non NULL, non empty, NUL-terminated
** strings.
** Returns the index in haystack of the first occurrence of needle, or
** -1 if no such occurrence was found and -2 in case of allocation error.
*/
|
||||
ssize_t word_search_kmp(char *haystack, char *needle);
|
||||
|
||||
#endif
|
||||
Loading…
Add table
Add a link
Reference in a new issue