/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;

/**
 * Tokenizes a string based on delimiters (separators)
 * and supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter there may be characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there may be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
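 * <p>
 * As an illustrative sketch (the exact tokens depend on the matchers configured),
 * a tokenizer with a comma delimiter and a single-quote character can be used like this:
 * <pre>
 * StrTokenizer tok = new StrTokenizer("a,'b,c','d''e'", ',', '\'');
 * while (tok.hasNext()) {
 *     String token = tok.next();  // "a", then "b,c", then "d'e"
 * }
 * </pre>
 * The quoted sections keep the delimiter as data, and the doubled quote inside the
 * last quoted section yields a single quote character in the token.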
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @author Apache Software Foundation
 * @author Matthew Inger
 * @author Gary D. Gregory
 * @since 2.2
 * @version $Id: StrTokenizer.java 907630 2010-02-08 12:22:32Z sebb $
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char chars[];
    /** The parsed tokens */
    private String tokens[];
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
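     * <p>
     * A minimal illustrative sketch (token values assume the default CSV settings
     * described above):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance(" a, b ,\"c,d\"");
     * String[] fields = tok.getTokenArray();  // ["a", "b", "c,d"]
     * </pre>
     *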
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(String input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(char[] input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
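     * <p>
     * A minimal illustrative sketch:
     * <pre>
     * StrTokenizer tok = StrTokenizer.getTSVInstance("a\tb\tc");
     * String[] fields = tok.getTokenArray();  // ["a", "b", "c"]
     * </pre>
     *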
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(String input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(char[] input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed
     */
    public StrTokenizer(String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     */
    public StrTokenizer(String input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter string
     */
    public StrTokenizer(String input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(String input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(String input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed, cloned
     */
    public StrTokenizer(char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input the string which is to be parsed, cloned
     * @param delim the field delimiter character
     */
    public StrTokenizer(char[] input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input the string which is to be parsed, cloned
     * @param delim the field delimiter string
     */
    public StrTokenizer(char[] input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed, cloned
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed, cloned
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(char[] input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed, cloned
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a List of String
     */
    public List<String> getTokenList() {
        checkTokenized();
        List<String> list = new ArrayList<String>(tokens.length);
        for (String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
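     * <p>
     * For example, a sketch of reusing one configured tokenizer across several
     * input rows (<code>lines</code> here is a hypothetical iterable of Strings):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("", ',');
     * for (String line : lines) {
     *     String[] fields = tok.reset(line).getTokenArray();
     * }
     * </pre>
     *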
     * @param input the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new character array to tokenize, cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there are no previous elements
     */
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void set(String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void add(String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
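     * <p>
     * A minimal sketch of such a subclass (hypothetical class name; assumes the
     * usual <code>java.util</code> imports) that filters out empty results:
     * <pre>
     * public class NonEmptyTokenizer extends StrTokenizer {
     *     protected List&lt;String&gt; tokenize(char[] chars, int offset, int count) {
     *         List&lt;String&gt; tokens = new ArrayList&lt;String&gt;(super.tokenize(chars, offset, count));
     *         for (Iterator&lt;String&gt; it = tokens.iterator(); it.hasNext();) {
     *             String token = it.next();
     *             if (token == null || token.length() == 0) {
     *                 it.remove();
     *             }
     *         }
     *         return tokens;
     *     }
     * }
     * </pre>
     *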
     * @param chars the character array being tokenized, may be null
     * @param offset the start position within the character array, must be valid
     * @param count the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(char[] chars, int offset, int count) {
        if (chars == null || count == 0) {
            return Collections.emptyList();
        }
        StrBuilder buf = new StrBuilder();
        List<String> tokens = new ArrayList<String>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(chars, pos, count, buf, tokens);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokens, "");
            }
        }
        return tokens;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list the list to add to
     * @param tok the token to add
     */
    private void addToken(List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param chars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokens the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List<String> tokens) {
        // skip all leading characters matched by the ignored or trimmer
        // matchers, unless the character is also a delimiter or a quote
        while (start < len) {
            int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(chars, start, start, len),
                    getTrimmerMatcher().isMatch(chars, start, start, len));
            if (removeLen == 0 ||
                getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
                getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokens, "");
            return -1;
        }

        // handle empty token
        int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
        if (delimLen > 0) {
            addToken(tokens, "");
            return start + delimLen;
        }

        // handle found token
        int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
        }
        return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param chars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokens the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
                               List<String> tokens, int quoteStart, int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = (quoteLen > 0);
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(chars, pos, quoteLen);
                        pos += (quoteLen * 2);
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokens, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0) {
                    if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                        quoting = true;
                        pos += quoteLen;
                        continue;
                    }
                }

                // check for ignored (outside quotes), and ignore
                int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(chars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokens, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param chars the character array being tokenized
     * @param pos the position to check for a quote
     * @param len the length of the character array being tokenized
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use a quote character.
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Set the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
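     * <p>
     * An illustrative sketch:
     * <pre>
     * StrTokenizer tok = new StrTokenizer("'a,b',c", ',', '\'');
     * String[] fields = tok.getTokenArray();  // ["a,b", "c"]
     * </pre>
     *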
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Set the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Set the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
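     * <p>
     * For example, an illustrative sketch with a comma delimiter:
     * <pre>
     * new StrTokenizer("a,,b", ',').getTokenArray();
     *     // ["a", "b"]  (empty token ignored by default)
     * new StrTokenizer("a,,b", ',').setIgnoreEmptyTokens(false).getTokenArray();
     *     // ["a", "", "b"]
     * </pre>
     *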
     * @param ignoreEmptyTokens whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of this tokenizer, including its token list.
     *
     * @return a String representation of the tokenizer and its parsed tokens
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}