Expansion: Added word search in text

An implementation of the Knuth, Morris and Pratt algorithm for finding the first occurrence of a word in a text.
2025-12-06 07:28:09 +01:00 · 2025-03-02 17:22:11 +01:00 · 2025-03-02 17:22:11 +01:00 · 768f1b3cb6
commit 768f1b3cb6
parent fa383c4f17
2 changed files with 148 additions and 0 deletions
--- a/src/word_search/word_search.c
+++ b/src/word_search/word_search.c
@ -0,0 +1,85 @@
+/* ************************************************************************** */
+/*                                                                            */
+/*                                                        :::      ::::::::   */
+/*   word_search.c                                      :+:      :+:    :+:   */
+/*                                                    +:+ +:+         +:+     */
+/*   By: jguelen <marvin@42.fr>                     +#+  +:+       +#+        */
+/*                                                +#+#+#+#+#+   +#+           */
+/*   Created: 2025/03/02 14:27:58 by jguelen           #+#    #+#             */
+/*   Updated: 2025/03/02 17:16:39 by jguelen          ###   ########.fr       */
+/*                                                                            */
+/* ************************************************************************** */
+
+#include "word_search.h"
+
+/*
+** Builds the Knuth-Morris-Pratt shift table ("kmpNext") for needle.
+**
+** @PARAM needle      NUL-terminated pattern; needle[needle_len] is read and
+**                    is expected to be the terminator.
+** @PARAM needle_len  must be the length of the string needle.
+**
+** The table has needle_len + 1 entries. kmp[0] is -1; for every other i,
+** kmp[i] is the pattern index at which comparison resumes after a mismatch
+** at needle[i], or -1 when the search window must move past the current text
+** character.
+**
+** Returns the table, which the caller must free, or NULL if needle is NULL,
+** needle_len is 0, needle_len does not fit in an int (table entries are ints),
+** the table size would overflow size_t, or the allocation fails.
+*/
+static int	*create_kmp_array(const char *needle, size_t needle_len)
+{
+	int		*kmp;
+	size_t	i;
+	int		j;
+
+	if (!needle || !needle_len || needle_len > (size_t)INT_MAX
+		|| needle_len + 1 > (size_t)-1 / sizeof(*kmp))
+		return (NULL);
+	kmp = malloc((needle_len + 1) * sizeof(*kmp));
+	if (!kmp)
+		return (NULL);
+	i = 0;
+	j = -1;
+	kmp[0] = -1;
+	while (i < needle_len)
+	{
+		while (j > -1 && needle[i] != needle[j])
+			j = kmp[j];
+		i++;
+		j++;
+		if (needle[i] == needle[j])
+			kmp[i] = kmp[j];
+		else
+			kmp[i] = j;
+	}
+	return (kmp);
+}
+
+/*
+** Scans haystack from left to right for needle, using the shift table kmp
+** built by create_kmp_array for that same needle, so that the position in
+** haystack never moves backwards.
+** Stops at the NUL terminator of haystack, so no separate length is needed.
+** Returns the index of the first occurrence of needle in haystack, or -1 if
+** the end of haystack is reached without a full match.
+*/
+static ssize_t	kmp_scan(const char *haystack, const char *needle,
+					const int *kmp, size_t needle_len)
+{
+	size_t	pos;
+	int		matched;
+
+	pos = 0;
+	matched = 0;
+	while (haystack[pos])
+	{
+		while (matched > -1 && needle[matched] != haystack[pos])
+			matched = kmp[matched];
+		pos++;
+		matched++;
+		if ((size_t)matched >= needle_len)
+			return ((ssize_t)(pos - (size_t)matched));
+	}
+	return (-1);
+}
+
+/*
+** Searches haystack for needle with the Knuth-Morris-Pratt algorithm.
+** (cf http://monge.univ-mlv.fr/~lecroq/string/node8.html#SECTION0080)
+** Could be extended to report all occurrences of needle in haystack
+** but for now only reports the first.
+**
+** @PARAM haystack  NUL-terminated text to search in.
+** @PARAM needle    NUL-terminated, non-empty word to look for.
+**
+** Returns the index of the first occurrence of needle in haystack, -1 if
+** there is none, and -2 if an argument is NULL or if create_kmp_array
+** returns NULL (empty needle or allocation error).
+*/
+ssize_t	word_search_kmp(char *haystack, char *needle)
+{
+	int		*kmp;
+	size_t	needle_len;
+	ssize_t	found;
+
+	if (!haystack || !needle)
+		return (-2);
+	needle_len = ft_strlen(needle);
+	kmp = create_kmp_array(needle, needle_len);
+	if (!kmp)
+		return (-2);
+	found = kmp_scan(haystack, needle, kmp, needle_len);
+	free(kmp);
+	return (found);
+}
--- a/src/word_search/word_search.h
+++ b/src/word_search/word_search.h
@ -0,0 +1,63 @@
+/* ************************************************************************** */
+/*                                                                            */
+/*                                                        :::      ::::::::   */
+/*   word_search.h                                      :+:      :+:    :+:   */
+/*                                                    +:+ +:+         +:+     */
+/*   By: jguelen <marvin@42.fr>                     +#+  +:+       +#+        */
+/*                                                +#+#+#+#+#+   +#+           */
+/*   Created: 2025/03/02 14:10:36 by jguelen           #+#    #+#             */
+/*   Updated: 2025/03/02 14:30:16 by jguelen          ###   ########.fr       */
+/*                                                                            */
+/* ************************************************************************** */
+
+#ifndef WORD_SEARCH_H
+# define WORD_SEARCH_H
+
+# include <limits.h>
+# include <stdlib.h>
+# include "libft.h"
+
+/*
+** An implementation of the Knuth, Morris and Pratt algorithm for exact word
+** searching in a text.
+** cf. http://monge.univ-mlv.fr/~lecroq/string/node8.html#SECTION0080
+**
+** The design of the Knuth-Morris-Pratt algorithm follows a tight analysis of
+** the Morris and Pratt algorithm. Let us look more closely at the Morris-Pratt
+** algorithm. It is possible to improve the length of the shifts.
+**
+** Consider an attempt at a left position j, that is when the window is
+** positioned on the text factor y[j .. j + m - 1]. Assume that the first
+** mismatch occurs between x[i] and y[i+j] with 0 < i < m. Then,
+** x[0 .. i - 1] = y[j .. i + j - 1] = u and a = x[i] != y[i + j] = b.
+**
+** When shifting, it is reasonable to expect that a prefix v of the pattern
+** matches some suffix of the portion u of the text. Moreover, if we want to
+** avoid another immediate mismatch, the character following the prefix v in the
+** pattern must be different from a. The longest such prefix v is called the
+** tagged border of u (it occurs at both ends of u followed by different
+** characters in x).
+**
+** This introduces the notation: let kmpNext[i] be the length of the longest
+** border of x[0 .. i - 1] followed by a character c different from x[i] and -1
+** if no such tagged border exists, for 0 < i <= m. Then, after a shift, the
+** comparisons can resume between characters x[kmpNext[i]] and y[i+j] without
+** missing any occurrence of x in y, and avoiding a backtrack on the text
+** (see figure 7.1). The value of kmpNext[0] is set to -1.
+** The table kmpNext can be computed in O(m) space and time before the searching
+** phase, applying the same searching algorithm to the pattern itself, as if
+** x = y.
+**
+** The searching phase can be performed in O(m + n) time. The Knuth-Morris-Pratt
+** algorithm performs at most 2 * n - 1 text character comparisons during the
+** searching phase. The delay (maximal number of comparisons for a single text
+** character) is bounded by log_Phi(m) where Phi is the golden ratio.
+*/
+
+/*
+** Uses the Knuth-Morris-Pratt algorithm.
+** Returns the index of the first occurrence of needle in haystack, -1 if
+** needle does not occur in haystack, and -2 if the shift table could not be
+** built (allocation error or empty needle).
+*/
+ssize_t	word_search_kmp(char *haystack, char *needle);
+
+#endif