/ lib / libedit / src / tokenizer.c
tokenizer.c
  1  /*	$NetBSD: tokenizer.c,v 1.18 2010/01/03 18:27:10 christos Exp $	*/
  2  
  3  /*-
  4   * Copyright (c) 1992, 1993
  5   *	The Regents of the University of California.  All rights reserved.
  6   *
  7   * This code is derived from software contributed to Berkeley by
  8   * Christos Zoulas of Cornell University.
  9   *
 10   * Redistribution and use in source and binary forms, with or without
 11   * modification, are permitted provided that the following conditions
 12   * are met:
 13   * 1. Redistributions of source code must retain the above copyright
 14   *    notice, this list of conditions and the following disclaimer.
 15   * 2. Redistributions in binary form must reproduce the above copyright
 16   *    notice, this list of conditions and the following disclaimer in the
 17   *    documentation and/or other materials provided with the distribution.
 18   * 3. Neither the name of the University nor the names of its contributors
 19   *    may be used to endorse or promote products derived from this software
 20   *    without specific prior written permission.
 21   *
 22   * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 23   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 24   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 25   * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 26   * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 27   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 28   * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 29   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 30   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 31   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 32   * SUCH DAMAGE.
 33   */
 34  
 35  #ifndef NARROWCHAR
 36  #include "config.h"
 37  #endif
 38  
 39  #if !defined(lint) && !defined(SCCSID)
 40  #if 0
 41  static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
 42  #else
 43  __RCSID("$NetBSD: tokenizer.c,v 1.18 2010/01/03 18:27:10 christos Exp $");
 44  #endif
 45  #endif /* not lint && not SCCSID */
 46  
 47  /* We build this file twice, once as NARROW, once as WIDE. */
 48  /*
  49   * tokenizer.c: Bourne shell like tokenizer
 50   */
 51  #include <string.h>
 52  #include <stdlib.h>
 53  #include "histedit.h"
 54  #include "chartype.h"
 55  
 56  typedef enum {
 57  	Q_none, Q_single, Q_double, Q_one, Q_doubleone
 58  } quote_t;
 59  
 60  #define	TOK_KEEP	1
 61  #define	TOK_EAT		2
 62  
 63  #define	WINCR		20
 64  #define	AINCR		10
 65  
 66  #define	IFS		STR("\t \n")
 67  
 68  #define	tok_malloc(a)		malloc(a)
 69  #define	tok_free(a)		free(a)
 70  #define	tok_realloc(a, b)	realloc(a, b)
 71  #define	tok_strdup(a)		Strdup(a)
 72  
 73  
 74  struct TYPE(tokenizer) {
 75  	Char	*ifs;		/* In field separator			 */
 76  	int	 argc, amax;	/* Current and maximum number of args	 */
 77  	Char   **argv;		/* Argument list			 */
 78  	Char	*wptr, *wmax;	/* Space and limit on the word buffer	 */
 79  	Char	*wstart;	/* Beginning of next word		 */
 80  	Char	*wspace;	/* Space of word buffer			 */
 81  	quote_t	 quote;		/* Quoting state			 */
 82  	int	 flags;		/* flags;				 */
 83  };
 84  
 85  
 86  private void FUN(tok,finish)(TYPE(Tokenizer) *);
 87  
 88  
 89  /* FUN(tok,finish)():
 90   *	Finish a word in the tokenizer.
 91   */
 92  private void
 93  FUN(tok,finish)(TYPE(Tokenizer) *tok)
 94  {
 95  
 96  	*tok->wptr = '\0';
 97  	if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
 98  		tok->argv[tok->argc++] = tok->wstart;
 99  		tok->argv[tok->argc] = NULL;
100  		tok->wstart = ++tok->wptr;
101  	}
102  	tok->flags &= ~TOK_KEEP;
103  }
104  
105  
106  /* FUN(tok,init)():
107   *	Initialize the tokenizer
108   */
109  public TYPE(Tokenizer) *
110  FUN(tok,init)(const Char *ifs)
111  {
112  	TYPE(Tokenizer) *tok = tok_malloc(sizeof(TYPE(Tokenizer)));
113  
114  	if (tok == NULL)
115  		return NULL;
116  	tok->ifs = tok_strdup(ifs ? ifs : IFS);
117  	if (tok->ifs == NULL) {
118  		tok_free((ptr_t)tok);
119  		return NULL;
120  	}
121  	tok->argc = 0;
122  	tok->amax = AINCR;
123  	tok->argv = tok_malloc(sizeof(*tok->argv) * tok->amax);
124  	if (tok->argv == NULL) {
125  		tok_free((ptr_t)tok->ifs);
126  		tok_free((ptr_t)tok);
127  		return NULL;
128  	}
129  	tok->argv[0] = NULL;
130  	tok->wspace = tok_malloc(WINCR * sizeof(*tok->wspace));
131  	if (tok->wspace == NULL) {
132  		tok_free((ptr_t)tok->argv);
133  		tok_free((ptr_t)tok->ifs);
134  		tok_free((ptr_t)tok);
135  		return NULL;
136  	}
137  	tok->wmax = tok->wspace + WINCR;
138  	tok->wstart = tok->wspace;
139  	tok->wptr = tok->wspace;
140  	tok->flags = 0;
141  	tok->quote = Q_none;
142  
143  	return (tok);
144  }
145  
146  
147  /* FUN(tok,reset)():
148   *	Reset the tokenizer
149   */
150  public void
151  FUN(tok,reset)(TYPE(Tokenizer) *tok)
152  {
153  
154  	tok->argc = 0;
155  	tok->wstart = tok->wspace;
156  	tok->wptr = tok->wspace;
157  	tok->flags = 0;
158  	tok->quote = Q_none;
159  }
160  
161  
162  /* FUN(tok,end)():
163   *	Clean up
164   */
165  public void
166  FUN(tok,end)(TYPE(Tokenizer) *tok)
167  {
168  
169  	tok_free((ptr_t) tok->ifs);
170  	tok_free((ptr_t) tok->wspace);
171  	tok_free((ptr_t) tok->argv);
172  	tok_free((ptr_t) tok);
173  }
174  
175  
176  
177  /* FUN(tok,line)():
178   *	Bourne shell (sh(1)) like tokenizing
179   *	Arguments:
180   *		tok	current tokenizer state (setup with FUN(tok,init)())
181   *		line	line to parse
182   *	Returns:
183   *		-1	Internal error
184   *		 3	Quoted return
185   *		 2	Unmatched double quote
186   *		 1	Unmatched single quote
187   *		 0	Ok
188   *	Modifies (if return value is 0):
189   *		argc	number of arguments
190   *		argv	argument array
191   *		cursorc	if !NULL, argv element containing cursor
192   *		cursorv	if !NULL, offset in argv[cursorc] of cursor
193   */
194  public int
195  FUN(tok,line)(TYPE(Tokenizer) *tok, const TYPE(LineInfo) *line,
196      int *argc, const Char ***argv, int *cursorc, int *cursoro)
197  {
198  	const Char *ptr;
199  	int cc, co;
200  
201  	cc = co = -1;
202  	ptr = line->buffer;
203  	for (ptr = line->buffer; ;ptr++) {
204  		if (ptr >= line->lastchar)
205  			ptr = STR("");
206  		if (ptr == line->cursor) {
207  			cc = tok->argc;
208  			co = (int)(tok->wptr - tok->wstart);
209  		}
210  		switch (*ptr) {
211  		case '\'':
212  			tok->flags |= TOK_KEEP;
213  			tok->flags &= ~TOK_EAT;
214  			switch (tok->quote) {
215  			case Q_none:
216  				tok->quote = Q_single;	/* Enter single quote
217  							 * mode */
218  				break;
219  
220  			case Q_single:	/* Exit single quote mode */
221  				tok->quote = Q_none;
222  				break;
223  
224  			case Q_one:	/* Quote this ' */
225  				tok->quote = Q_none;
226  				*tok->wptr++ = *ptr;
227  				break;
228  
229  			case Q_double:	/* Stay in double quote mode */
230  				*tok->wptr++ = *ptr;
231  				break;
232  
233  			case Q_doubleone:	/* Quote this ' */
234  				tok->quote = Q_double;
235  				*tok->wptr++ = *ptr;
236  				break;
237  
238  			default:
239  				return (-1);
240  			}
241  			break;
242  
243  		case '"':
244  			tok->flags &= ~TOK_EAT;
245  			tok->flags |= TOK_KEEP;
246  			switch (tok->quote) {
247  			case Q_none:	/* Enter double quote mode */
248  				tok->quote = Q_double;
249  				break;
250  
251  			case Q_double:	/* Exit double quote mode */
252  				tok->quote = Q_none;
253  				break;
254  
255  			case Q_one:	/* Quote this " */
256  				tok->quote = Q_none;
257  				*tok->wptr++ = *ptr;
258  				break;
259  
260  			case Q_single:	/* Stay in single quote mode */
261  				*tok->wptr++ = *ptr;
262  				break;
263  
264  			case Q_doubleone:	/* Quote this " */
265  				tok->quote = Q_double;
266  				*tok->wptr++ = *ptr;
267  				break;
268  
269  			default:
270  				return (-1);
271  			}
272  			break;
273  
274  		case '\\':
275  			tok->flags |= TOK_KEEP;
276  			tok->flags &= ~TOK_EAT;
277  			switch (tok->quote) {
278  			case Q_none:	/* Quote next character */
279  				tok->quote = Q_one;
280  				break;
281  
282  			case Q_double:	/* Quote next character */
283  				tok->quote = Q_doubleone;
284  				break;
285  
286  			case Q_one:	/* Quote this, restore state */
287  				*tok->wptr++ = *ptr;
288  				tok->quote = Q_none;
289  				break;
290  
291  			case Q_single:	/* Stay in single quote mode */
292  				*tok->wptr++ = *ptr;
293  				break;
294  
295  			case Q_doubleone:	/* Quote this \ */
296  				tok->quote = Q_double;
297  				*tok->wptr++ = *ptr;
298  				break;
299  
300  			default:
301  				return (-1);
302  			}
303  			break;
304  
305  		case '\n':
306  			tok->flags &= ~TOK_EAT;
307  			switch (tok->quote) {
308  			case Q_none:
309  				goto tok_line_outok;
310  
311  			case Q_single:
312  			case Q_double:
313  				*tok->wptr++ = *ptr;	/* Add the return */
314  				break;
315  
316  			case Q_doubleone:   /* Back to double, eat the '\n' */
317  				tok->flags |= TOK_EAT;
318  				tok->quote = Q_double;
319  				break;
320  
321  			case Q_one:	/* No quote, more eat the '\n' */
322  				tok->flags |= TOK_EAT;
323  				tok->quote = Q_none;
324  				break;
325  
326  			default:
327  				return (0);
328  			}
329  			break;
330  
331  		case '\0':
332  			switch (tok->quote) {
333  			case Q_none:
334  				/* Finish word and return */
335  				if (tok->flags & TOK_EAT) {
336  					tok->flags &= ~TOK_EAT;
337  					return (3);
338  				}
339  				goto tok_line_outok;
340  
341  			case Q_single:
342  				return (1);
343  
344  			case Q_double:
345  				return (2);
346  
347  			case Q_doubleone:
348  				tok->quote = Q_double;
349  				*tok->wptr++ = *ptr;
350  				break;
351  
352  			case Q_one:
353  				tok->quote = Q_none;
354  				*tok->wptr++ = *ptr;
355  				break;
356  
357  			default:
358  				return (-1);
359  			}
360  			break;
361  
362  		default:
363  			tok->flags &= ~TOK_EAT;
364  			switch (tok->quote) {
365  			case Q_none:
366  				if (Strchr(tok->ifs, *ptr) != NULL)
367  					FUN(tok,finish)(tok);
368  				else
369  					*tok->wptr++ = *ptr;
370  				break;
371  
372  			case Q_single:
373  			case Q_double:
374  				*tok->wptr++ = *ptr;
375  				break;
376  
377  
378  			case Q_doubleone:
379  				*tok->wptr++ = '\\';
380  				tok->quote = Q_double;
381  				*tok->wptr++ = *ptr;
382  				break;
383  
384  			case Q_one:
385  				tok->quote = Q_none;
386  				*tok->wptr++ = *ptr;
387  				break;
388  
389  			default:
390  				return (-1);
391  
392  			}
393  			break;
394  		}
395  
396  		if (tok->wptr >= tok->wmax - 4) {
397  			size_t size = tok->wmax - tok->wspace + WINCR;
398  			Char *s = tok_realloc(tok->wspace,
399  			    size * sizeof(*s));
400  			if (s == NULL)
401  				return (-1);
402  
403  			if (s != tok->wspace) {
404  				int i;
405  				for (i = 0; i < tok->argc; i++) {
406  				    tok->argv[i] =
407  					(tok->argv[i] - tok->wspace) + s;
408  				}
409  				tok->wptr = (tok->wptr - tok->wspace) + s;
410  				tok->wstart = (tok->wstart - tok->wspace) + s;
411  				tok->wspace = s;
412  			}
413  			tok->wmax = s + size;
414  		}
415  		if (tok->argc >= tok->amax - 4) {
416  			Char **p;
417  			tok->amax += AINCR;
418  			p = tok_realloc(tok->argv, tok->amax * sizeof(*p));
419  			if (p == NULL)
420  				return (-1);
421  			tok->argv = p;
422  		}
423  	}
424   tok_line_outok:
425  	if (cc == -1 && co == -1) {
426  		cc = tok->argc;
427  		co = (int)(tok->wptr - tok->wstart);
428  	}
429  	if (cursorc != NULL)
430  		*cursorc = cc;
431  	if (cursoro != NULL)
432  		*cursoro = co;
433  	FUN(tok,finish)(tok);
434  	*argv = (const Char **)tok->argv;
435  	*argc = tok->argc;
436  	return (0);
437  }
438  
439  /* FUN(tok,str)():
440   *	Simpler version of tok_line, taking a NUL terminated line
441   *	and splitting into words, ignoring cursor state.
442   */
443  public int
444  FUN(tok,str)(TYPE(Tokenizer) *tok, const Char *line, int *argc,
445      const Char ***argv)
446  {
447  	TYPE(LineInfo) li;
448  
449  	memset(&li, 0, sizeof(li));
450  	li.buffer = line;
451  	li.cursor = li.lastchar = Strchr(line, '\0');
452  	return (FUN(tok,line)(tok, &li, argc, argv, NULL, NULL));
453  }