wordsplit: use individual rule funcs

2025-12-06 07:28:09 +01:00 · 2025-02-19 18:22:26 +01:00 · 2025-02-19 18:22:26 +01:00 · 3e64ac3769
commit 3e64ac3769
parent 58be71725b
4 changed files with 153 additions and 69 deletions
--- a/src/parser/wordsplit/tokenizing_1_5.c
+++ b/src/parser/wordsplit/tokenizing_1_5.c
@ -6,7 +6,7 @@
 /*   By: jguelen <marvin@42.fr>                     +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
 /*   Created: 2025/02/19 13:20:01 by jguelen           #+#    #+#             */
-/*   Updated: 2025/02/19 18:01:39 by jguelen          ###   ########.fr       */
+/*   Updated: 2025/02/19 18:27:37 by khais            ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */

@ -16,12 +16,70 @@
 ** cf. Token Recognition section at
 ** https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html
 */
+
 /*
 ** 1. If the end of input is recognized, the current token (if any) shall be
 ** delimited.
+**
+** Returns true once the terminating '\0' is the current character (the result
+** of delimit() is stored in token_build->wordlist), false otherwise.
 */
-void	token_rule_1(t_token_build *token_build, char *original)
+bool	token_rule_1(t_token_build *token_build, char *original)
 {
-	if (original[token_build.current_index] == '\0')
-		token_build->wordlist = delimit(token_build);
+	if (original[token_build->current_index] != '\0')
+		return (false);
+	token_build->wordlist = delimit(token_build);
+	return (true);
 }
+
+/*
+** 2. If the previous character was used as part of an operator and the current
+** character is not quoted and can be used with the previous characters to form
+** an operator, it shall be used as part of that (operator) token.
+**
+** Returns true when the current character was pushed onto the operator token,
+** false when the rule does not apply.
+*/
+bool	token_rule_2(t_token_build *token_build, char *original)
+{
+	char	c;
+
+	c = original[token_build->current_index];
+	if (token_build->currently_in_operator && token_build->quote == '\0'
+		&& is_operator_combo(token_build->cur_token->buffer, c))
+	{
+		token_build->cur_token = push_char(token_build, c);
+		return (true);
+	}
+	return (false);
+}
+
+/*
+** 3. If the previous character was used as part of an operator and the current
+** character cannot be used with the previous characters to form an operator,
+** the operator containing the previous character shall be delimited.
+**
+** Returns true when the operator token was delimited, false when the rule does
+** not apply. The value returned by delimit() is stored back into
+** token_build->wordlist, the same way token_rule_1 does it.
+*/
+bool	token_rule_3(t_token_build *token_build, char *original)
+{
+	char	c;
+
+	c = original[token_build->current_index];
+	if (token_build->currently_in_operator && token_build->quote == '\0'
+		&& !is_operator_combo(token_build->cur_token->buffer, c))
+	{
+		token_build->wordlist = delimit(token_build);
+		return (true);
+	}
+	return (false);
+}
+
+/*
+** 4. If the current character is single-quote, or double-quote and it is not
+** quoted, it shall affect quoting for subsequent characters up to the end of
+** the quoted text. The rules for quoting are as described in Quoting. The
+** result token shall contain exactly the characters that appear in the input,
+** unmodified, including any embedded or enclosing quotes or substitution
+** operators, between the <quotation-mark> and the end of the quoted text. The
+** token shall not be delimited by the end of the quoted field.
+**
+** Returns true when the current character is ' or " (the value returned by
+** quote_flip() becomes the new token_build->quote), false otherwise.
+*/
+bool	token_rule_4(t_token_build *token_build, char *original)
+{
+	char	c;
+
+	c = original[token_build->current_index];
+	if (c == '\'' || c == '"')
+	{
+		token_build->quote = quote_flip(token_build, c);
+		return (true);
+	}
+	return (false);
+}
+
+bool		token_rule_5(t_token_build *token_build, char *original);
--- a/src/parser/wordsplit/tokenizing_6_10.c
+++ b/src/parser/wordsplit/tokenizing_6_10.c
@ -6,10 +6,68 @@
 /*   By: jguelen <marvin@42.fr>                     +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
 /*   Created: 2025/02/19 13:21:18 by jguelen           #+#    #+#             */
-/*   Updated: 2025/02/19 13:21:36 by jguelen          ###   ########.fr       */
+/*   Updated: 2025/02/19 18:25:55 by khais            ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */

 #include "wordsplit.h"

+/*
+** cf. Token Recognition section at
+** https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html
+*/

+/*
+** 6. If the current character is not quoted and can be used as the first
+** character of a new operator, the current token (if any) shall be delimited.
+** The current character shall be used as the beginning of the next (operator)
+** token.
+**
+** Returns true when operator_start() was invoked for the current character,
+** false when the rule does not apply.
+*/
+bool	token_rule_6(t_token_build *token_build, char *original)
+{
+	char	c;
+
+	c = original[token_build->current_index];
+	if (token_build->quote == '\0' && is_operator_start(c))
+	{
+		operator_start(token_build, c);
+		return (true);
+	}
+	return (false);
+}
+
+/*
+** 7. If the current character is an unquoted <blank>, any token containing the
+** previous character is delimited and the current character shall be discarded.
+**
+** Returns true when the current character is an unquoted blank (the value
+** returned by delimit() is stored in token_build->wordlist), false otherwise.
+*/
+bool	token_rule_7(t_token_build *token_build, char *original)
+{
+	if (is_blank(original[token_build->current_index])
+		&& token_build->quote == '\0')
+	{
+		token_build->wordlist = delimit(token_build);
+		return (true);
+	}
+	return (false);
+}
+
+/*
+** 8. If the previous character was part of a word, the current character shall
+** be appended to that word.
+**
+** Returns true when the current character was pushed onto the current token,
+** false when no word is in progress.
+** NOTE(review): assumes t_token_build carries a currently_in_word flag next to
+** currently_in_operator -- confirm against the struct in wordsplit.h.
+*/
+bool	token_rule_8(t_token_build *token_build, char *original)
+{
+	char	c;
+
+	if (!token_build->currently_in_word)
+		return (false);
+	c = original[token_build->current_index];
+	token_build->cur_token = push_char(token_build, c);
+	return (true);
+}
+
+/*
+** 10. The current character is used as the start of a new word.
+**
+** Fallback rule: always applies, so it always returns true. The buffer
+** returned by new_word() becomes token_build->cur_token.
+*/
+bool	token_rule_10(t_token_build *token_build, char *original)
+{
+	char	c;
+
+	c = original[token_build->current_index];
+	token_build->cur_token = new_word(token_build, c);
+	return (true);
+}
--- a/src/parser/wordsplit/wordsplit.c
+++ b/src/parser/wordsplit/wordsplit.c
@ -3,14 +3,15 @@
 /*                                                        :::      ::::::::   */
 /*   wordsplit.c                                        :+:      :+:    :+:   */
 /*                                                    +:+ +:+         +:+     */
-/*   By: jguelen <jguelen@student.42.fr>            +#+  +:+       +#+        */
+/*   By: khais <marvin@42.fr>                       +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
-/*   Created: 2025/02/13 17:02:32 by khais             #+#    #+#             */
-/*   Updated: 2025/02/19 16:58:47 by jguelen          ###   ########.fr       */
+/*   Created: 2025/02/19 18:26:57 by khais             #+#    #+#             */
+/*   Updated: 2025/02/19 18:26:57 by khais            ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */

 #include "wordsplit.h"
+#include "libft.h"

 /*
 ** split a string into words, respecting quotes etc.
@ -26,65 +27,24 @@ t_wordlist	*minishell_wordsplit(char *original)
 {
 	t_token_build	token_build;

-	ft_bzero(&token_build);
-	while (true)
+	ft_bzero(&token_build, sizeof(t_token_build));
+	while (!token_rule_1(&token_build, original))
 	{
-		// 1. If the end of input is recognized, the current token (if any)
-		// shall be delimited.
-		if (original[token_build.current_index] == '\0')
-			token_build.wordlist = delimit(wordlist, &token, &currently_in_word,
-					&currently_in_operator);
-		// 2. If the previous character was used as part of an operator and the
-		// current character is not quoted and can be used with the previous
-		// characters to form an operator, it shall be used as part of that
-		// (operator) token.
-		else if (token_build.currently_in_operator && token_build.quote == '\0'
-			&& is_operator_combo(token_build.cur_token->buffer, original[token_build.current_index]))
-			token_build.cur_token = push_char(token, original[idx]);
-		// 3. If the previous character was used as part of an operator and the
-		// current character cannot be used with the previous characters to form
-		// an operator, the operator containing the previous character shall be
-		// delimited.
-		else if (currently_in_operator && quote == '\0'
-			&& !is_operator_combo(token->buffer, original[idx]))
-		{
-			wordlist = delimit(wordlist, &token, &currently_in_word,
-					&currently_in_operator);
+		if (token_rule_2(&token_build, original))
 			continue ;
-		}
-		// 4. If the current character is single-quote, or double-quote and it
-		// is not quoted, it shall affect quoting for subsequent characters up
-		// to the end of the quoted text. The rules for quoting are as described
-		// in Quoting . The result token shall contain exactly the characters
-		// that appear in the input, unmodified, including any embedded or
-		// enclosing quotes or substitution operators, between the
-		// <quotation-mark> and the end of the quoted text. The token shall not
-		// be delimited by the end of the quoted field.
-		else if (original[idx] == '\'' || original[idx] == '"')
-			quote = quote_flip(&token, original[idx], quote);
-		// 6. If the current character is not quoted and can be used as the
-		// first character of a new operator, the current token (if any) shall
-		// be delimited. The current character shall be used as the beginning of
-		// the next (operator) token.
-		else if (quote == '\0' && is_operator_start(original[idx]))
-			operator_start(&wordlist, &token, original[idx],
-				&currently_in_word, &currently_in_operator);
-		// If the current character is an unquoted <blank>, any token containing
-		// the previous character is delimited and the current character shall
-		// be discarded.
-		else if (is_blank(original[idx]) && quote == '\0')
-			wordlist = delimit(wordlist, &token, &currently_in_word,
-					&currently_in_operator);
-		// If the previous character was part of a word, the current character
-		// shall be appended to that word.
-		else if (currently_in_word)
-			token = push_char(token, original[idx]);
-		// The current character is used as the start of a new word.
-		else
-			token = new_word(token, original[idx], &currently_in_word);
-		if (original[idx] == '\0')
-			break ;
-		idx++;
+		if (token_rule_3(&token_build, original))
+			continue ;
+		if (token_rule_4(&token_build, original))
+			continue ;
+		if (token_rule_5(&token_build, original))
+			continue ;
+		if (token_rule_6(&token_build, original))
+			continue ;
+		if (token_rule_7(&token_build, original))
+			continue ;
+		if (token_rule_8(&token_build, original))
+			continue ;
+		token_rule_10(&token_build, original);
 	}
 	if (token_build.quote != '\0')
 		return (wordlist_destroy(token_build.wordlist), NULL);
--- a/src/parser/wordsplit/wordsplit.h
+++ b/src/parser/wordsplit/wordsplit.h
@ -3,10 +3,10 @@
 /*                                                        :::      ::::::::   */
 /*   wordsplit.h                                        :+:      :+:    :+:   */
 /*                                                    +:+ +:+         +:+     */
-/*   By: jguelen <jguelen@student.42.fr>            +#+  +:+       +#+        */
+/*   By: khais <marvin@42.fr>                       +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
-/*   Created: 2025/02/13 15:52:48 by khais             #+#    #+#             */
-/*   Updated: 2025/02/19 18:02:50 by jguelen          ###   ########.fr       */
+/*   Created: 2025/02/19 18:22:52 by khais             #+#    #+#             */
+/*   Updated: 2025/02/19 18:22:52 by khais            ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */

@ -36,7 +36,15 @@ t_buffer	*push_char(t_token_build *token_build, char c);
 t_buffer	*new_word(t_token_build *token_build, char c);
 char		quote_flip(t_token_build *token_build, char c);
 void		operator_start(t_token_build *token_build, char c);
-void		token_rule_1(t_token_build *token_build, char *original);
+bool		token_rule_1(t_token_build *token_build, char *original);
+bool		token_rule_2(t_token_build *token_build, char *original);
+bool		token_rule_3(t_token_build *token_build, char *original);
+bool		token_rule_4(t_token_build *token_build, char *original);
+bool		token_rule_5(t_token_build *token_build, char *original);
+bool		token_rule_6(t_token_build *token_build, char *original);
+bool		token_rule_7(t_token_build *token_build, char *original);
+bool		token_rule_8(t_token_build *token_build, char *original);
+bool		token_rule_10(t_token_build *token_build, char *original);

 t_wordlist	*minishell_wordsplit(char *original);