From 3e64ac37692f34cf7fce9302d9d50586170b0420 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kha=C3=AFs=20COLIN?=
Date: Wed, 19 Feb 2025 18:22:26 +0100
Subject: [PATCH] wordsplit: use individual rule funcs

---
 src/parser/wordsplit/tokenizing_1_5.c  | 86 +++++++++++++++++++++++-
 src/parser/wordsplit/tokenizing_6_10.c | 67 ++++++++++++++++++-
 src/parser/wordsplit/wordsplit.c       | 77 ++++++---------------
 src/parser/wordsplit/wordsplit.h       | 18 +++--
 4 files changed, 179 insertions(+), 69 deletions(-)

diff --git a/src/parser/wordsplit/tokenizing_1_5.c b/src/parser/wordsplit/tokenizing_1_5.c
index 0ca64de..622fbc5 100644
--- a/src/parser/wordsplit/tokenizing_1_5.c
+++ b/src/parser/wordsplit/tokenizing_1_5.c
@@ -6,7 +6,7 @@
 /* By: jguelen +#+ +:+ +#+ */
 /* +#+#+#+#+#+ +#+ */
 /* Created: 2025/02/19 13:20:01 by jguelen #+# #+# */
-/* Updated: 2025/02/19 18:01:39 by jguelen ### ########.fr */
+/* Updated: 2025/02/19 18:27:37 by khais ### ########.fr */
 /* */
 /* ************************************************************************** */
 
@@ -16,12 +16,92 @@
 ** cf. Token Recognition section at
 ** https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html
 */
+
 /*
 ** 1. If the end of input is recognized, the current token (if any) shall be
 ** delimited.
 */
-void	token_rule_1(t_token_build *token_build, char *original)
+bool	token_rule_1(t_token_build *token_build, char *original)
 {
-	if (original[token_build.current_index] == '\0')
+	if (original[token_build->current_index] == '\0')
+	{
 		token_build->wordlist = delimit(token_build);
+		return (true);
+	}
+	return (false);
 }
+
+/*
+** 2. If the previous character was used as part of an operator and the current
+** character is not quoted and can be used with the previous characters to form
+** an operator, it shall be used as part of that (operator) token.
+*/
+bool	token_rule_2(t_token_build *token_build, char *original)
+{
+	char	c;
+
+	c = original[token_build->current_index];
+	if (token_build->currently_in_operator && token_build->quote == '\0'
+		&& is_operator_combo(token_build->cur_token->buffer, c))
+	{
+		token_build->cur_token = push_char(token_build, c);
+		return (true);
+	}
+	return (false);
+}
+
+/*
+** 3. If the previous character was used as part of an operator and the current
+** character cannot be used with the previous characters to form an operator,
+** the operator containing the previous character shall be delimited.
+*/
+bool	token_rule_3(t_token_build *token_build, char *original)
+{
+	char	c;
+
+	c = original[token_build->current_index];
+	if (token_build->currently_in_operator && token_build->quote == '\0'
+		&& !is_operator_combo(token_build->cur_token->buffer, c))
+	{
+		token_build->wordlist = delimit(token_build);
+		return (true);
+	}
+	return (false);
+}
+
+/*
+** 4. If the current character is single-quote, or double-quote and it is not
+** quoted, it shall affect quoting for subsequent characters up to the end of
+** the quoted text. The rules for quoting are as described in Quoting. The
+** result token shall contain exactly the characters that appear in the input,
+** unmodified, including any embedded or enclosing quotes or substitution
+** operators, between the <quotation-mark> and the end of the quoted text. The
+** token shall not be delimited by the end of the quoted field.
+*/
+bool	token_rule_4(t_token_build *token_build, char *original)
+{
+	char	c;
+
+	c = original[token_build->current_index];
+	if (c == '\'' || c == '"')
+	{
+		token_build->quote = quote_flip(token_build, c);
+		return (true);
+	}
+	return (false);
+}
+
+/*
+** 5. If the current character is an unquoted '$' or '`', the shell shall
+** identify the start of any candidates for parameter expansion, command
+** substitution, or arithmetic expansion.
+** Not implemented here: the rule never matches, so such characters fall
+** through to rules 6-10, exactly as in the loop this function was split
+** out of (it had no branch for rule 5).
+*/
+bool	token_rule_5(t_token_build *token_build, char *original)
+{
+	(void)token_build;
+	(void)original;
+	return (false);
+}
diff --git a/src/parser/wordsplit/tokenizing_6_10.c b/src/parser/wordsplit/tokenizing_6_10.c
index cc316fe..47c2895 100644
--- a/src/parser/wordsplit/tokenizing_6_10.c
+++ b/src/parser/wordsplit/tokenizing_6_10.c
@@ -6,10 +6,75 @@
 /* By: jguelen +#+ +:+ +#+ */
 /* +#+#+#+#+#+ +#+ */
 /* Created: 2025/02/19 13:21:18 by jguelen #+# #+# */
-/* Updated: 2025/02/19 13:21:36 by jguelen ### ########.fr */
+/* Updated: 2025/02/19 18:25:55 by khais ### ########.fr */
 /* */
 /* ************************************************************************** */
 
 #include "wordsplit.h"
 
+/*
+** cf. Token Recognition section at
+** https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html
+*/
+
+/*
+** 6. If the current character is not quoted and can be used as the first
+** character of a new operator, the current token (if any) shall be delimited.
+** The current character shall be used as the beginning of the next (operator)
+** token.
+*/
+bool	token_rule_6(t_token_build *token_build, char *original)
+{
+	char	c;
+
+	c = original[token_build->current_index];
+	if (token_build->quote == '\0' && is_operator_start(c))
+	{
+		operator_start(token_build, c);
+		return (true);
+	}
+	return (false);
+}
+
+/*
+** 7. If the current character is an unquoted <blank>, any token containing the
+** previous character is delimited and the current character shall be discarded.
+*/
+bool	token_rule_7(t_token_build *token_build, char *original)
+{
+	char	c;
+
+	c = original[token_build->current_index];
+	if (is_blank(c) && token_build->quote == '\0')
+	{
+		token_build->wordlist = delimit(token_build);
+		return (true);
+	}
+	return (false);
+}
+
+/*
+** 8. If the previous character was part of a word, the current character shall
+** be appended to that word.
+*/
+bool	token_rule_8(t_token_build *token_build, char *original)
+{
+	if (token_build->currently_in_word)
+	{
+		token_build->cur_token = push_char(token_build,
+				original[token_build->current_index]);
+		return (true);
+	}
+	return (false);
+}
+
+/*
+** 10. The current character is used as the start of a new word.
+*/
+bool	token_rule_10(t_token_build *token_build, char *original)
+{
+	token_build->cur_token = new_word(token_build,
+			original[token_build->current_index]);
+	return (true);
+}
 
diff --git a/src/parser/wordsplit/wordsplit.c b/src/parser/wordsplit/wordsplit.c
index 59c8f6d..e4e8ed7 100644
--- a/src/parser/wordsplit/wordsplit.c
+++ b/src/parser/wordsplit/wordsplit.c
@@ -3,14 +3,15 @@
 /* ::: :::::::: */
 /* wordsplit.c :+: :+: :+: */
 /* +:+ +:+ +:+ */
-/* By: jguelen +#+ +:+ +#+ */
+/* By: khais +#+ +:+ +#+ */
 /* +#+#+#+#+#+ +#+ */
-/* Created: 2025/02/13 17:02:32 by khais #+# #+# */
-/* Updated: 2025/02/19 16:58:47 by jguelen ### ########.fr */
+/* Created: 2025/02/19 18:26:57 by khais #+# #+# */
+/* Updated: 2025/02/19 18:26:57 by khais ### ########.fr */
 /* */
 /* ************************************************************************** */
 
 #include "wordsplit.h"
+#include "libft.h"
 
 /*
 ** split a string into words, respecting quotes etc.
@@ -26,65 +27,21 @@ t_wordlist *minishell_wordsplit(char *original)
 {
 	t_token_build	token_build;
 
-	ft_bzero(&token_build);
-	while (true)
+	ft_bzero(&token_build, sizeof(t_token_build));
+	while (!token_rule_1(&token_build, original))
 	{
-		// 1. If the end of input is recognized, the current token (if any)
-		// shall be delimited.
-		if (original[token_build.current_index] == '\0')
-			token_build.wordlist = delimit(wordlist, &token, &currently_in_word,
-					&currently_in_operator);
-		// 2. If the previous character was used as part of an operator and the
-		// current character is not quoted and can be used with the previous
-		// characters to form an operator, it shall be used as part of that
-		// (operator) token.
-		else if (token_build.currently_in_operator && token_build.quote == '\0'
-			&& is_operator_combo(token_build.cur_token->buffer, original[token_build.current_index]))
-			token_build.cur_token = push_char(token, original[idx]);
-		// 3. If the previous character was used as part of an operator and the
-		// current character cannot be used with the previous characters to form
-		// an operator, the operator containing the previous character shall be
-		// delimited.
-		else if (currently_in_operator && quote == '\0'
-			&& !is_operator_combo(token->buffer, original[idx]))
-		{
-			wordlist = delimit(wordlist, &token, &currently_in_word,
-					&currently_in_operator);
+		// rule 3 delimits without consuming: re-run the rules on the same char
+		if (token_rule_3(&token_build, original))
 			continue ;
-		}
-		// 4. If the current character is single-quote, or double-quote and it
-		// is not quoted, it shall affect quoting for subsequent characters up
-		// to the end of the quoted text. The rules for quoting are as described
-		// in Quoting . The result token shall contain exactly the characters
-		// that appear in the input, unmodified, including any embedded or
-		// enclosing quotes or substitution operators, between the
-		// <quotation-mark> and the end of the quoted text. The token shall not
-		// be delimited by the end of the quoted field.
-		else if (original[idx] == '\'' || original[idx] == '"')
-			quote = quote_flip(&token, original[idx], quote);
-		// 6. If the current character is not quoted and can be used as the
-		// first character of a new operator, the current token (if any) shall
-		// be delimited. The current character shall be used as the beginning of
-		// the next (operator) token.
-		else if (quote == '\0' && is_operator_start(original[idx]))
-			operator_start(&wordlist, &token, original[idx],
-				&currently_in_word, &currently_in_operator);
-		// If the current character is an unquoted <blank>, any token containing
-		// the previous character is delimited and the current character shall
-		// be discarded.
-		else if (is_blank(original[idx]) && quote == '\0')
-			wordlist = delimit(wordlist, &token, &currently_in_word,
-					&currently_in_operator);
-		// If the previous character was part of a word, the current character
-		// shall be appended to that word.
-		else if (currently_in_word)
-			token = push_char(token, original[idx]);
-		// The current character is used as the start of a new word.
-		else
-			token = new_word(token, original[idx], &currently_in_word);
-		if (original[idx] == '\0')
-			break ;
-		idx++;
+		if (!token_rule_2(&token_build, original)
+			&& !token_rule_4(&token_build, original)
+			&& !token_rule_5(&token_build, original)
+			&& !token_rule_6(&token_build, original)
+			&& !token_rule_7(&token_build, original)
+			&& !token_rule_8(&token_build, original))
+			token_rule_10(&token_build, original);
+		// every rule other than 3 has consumed the current character
+		token_build.current_index++;
 	}
 	if (token_build.quote != '\0')
 		return (wordlist_destroy(token_build.wordlist), NULL);
diff --git a/src/parser/wordsplit/wordsplit.h b/src/parser/wordsplit/wordsplit.h
index c58656d..5bd6cc2 100644
--- a/src/parser/wordsplit/wordsplit.h
+++ b/src/parser/wordsplit/wordsplit.h
@@ -3,10 +3,10 @@
 /* ::: :::::::: */
 /* wordsplit.h :+: :+: :+: */
 /* +:+ +:+ +:+ */
-/* By: jguelen +#+ +:+ +#+ */
+/* By: khais +#+ +:+ +#+ */
 /* +#+#+#+#+#+ +#+ */
-/* Created: 2025/02/13 15:52:48 by khais #+# #+# */
-/* Updated: 2025/02/19 18:02:50 by jguelen ### ########.fr */
+/* Created: 2025/02/19 18:22:52 by khais #+# #+# */
+/* Updated: 2025/02/19 18:22:52 by khais ### ########.fr */
 /* */
 /* ************************************************************************** */
 
@@ -36,8 +36,16 @@ t_buffer *push_char(t_token_build *token_build, char c);
 t_buffer	*new_word(t_token_build *token_build, char c);
 char	quote_flip(t_token_build *token_build, char c);
 void	operator_start(t_token_build *token_build, char c);
-void	token_rule_1(t_token_build *token_build, char *original);
+bool	token_rule_1(t_token_build *token_build, char *original);
+bool	token_rule_2(t_token_build *token_build, char *original);
+bool	token_rule_3(t_token_build *token_build, char *original);
+bool	token_rule_4(t_token_build *token_build, char *original);
+bool	token_rule_5(t_token_build *token_build, char *original);
+bool	token_rule_6(t_token_build *token_build, char *original);
+bool	token_rule_7(t_token_build *token_build, char *original);
+bool	token_rule_8(t_token_build *token_build, char *original);
+bool	token_rule_10(t_token_build *token_build, char *original);
 
 t_wordlist	*minishell_wordsplit(char *original);
 
-#endif
\ No newline at end of file
+#endif