package journal.reader; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.text.ParseException; public class Tokenizer { private BufferedReader reader; private StringReader lineReader = null; private int line = 0; private int column = 0; private boolean atLeftMargin = true; private Token lastToken = null; public Tokenizer(Reader reader) { this.reader = new BufferedReader(reader); } // returns next token or NULL if EOL reached // throws ParseException if error state reached public Token nextToken() throws ParseException, IOException { // if previous token was pushed, return that if (lastToken != null) { Token returnToken = lastToken; lastToken = null; return returnToken; } // read first char // if @, ignore @, keep reading till next @ or EOL (=> error) // otherwise, needs to be number or hex (otherwise => error) // Token token = new Token(column, line, atLeftMargin); while (true) { int ch = readChar(); if (ch == -1) { return null; } else if (ch == '@') { return eatStringToken(token); } else if (isHex(ch) || ch == '-') { return eatNumberToken(token, ch); } else if (ch == '\n') { token.setAtLeftMargin(true); continue; // not a token, just end of line. 
Keep looking } else if (ch == ' ') { // ignore random space continue; } else { throw new ParseException("Could not interpret character " + ch + "(" + (char) ch + ")", line); } } } public void pushToken(Token token) { lastToken = token; } private Token eatNumberToken(Token token, int ch) throws IOException, ParseException { StringBuffer buffer = new StringBuffer(); // add this digit to the token // keep reading until non-digit is reached Token.Type type = Token.Type.INTEGER_TOKEN; do { buffer.append((char)ch); if (type == Token.Type.INTEGER_TOKEN && (isHexChar(ch))) type = Token.Type.HEX_TOKEN; if (isBase64Char(ch)) type = Token.Type.BASE64_TOKEN; ch = readChar(); } while (isHex(ch)); verifyWhitespaceOrEndOfLine(ch); token.setValue(buffer.toString(), type); return token; } private Token eatStringToken(Token token) throws IOException, ParseException { int ch; // read and add to token until closing '@' is reached // or end of file, which is an error StringBuffer buffer = new StringBuffer(); while (true) { ch = readChar(); while (ch != '@') { if (ch == -1) { throw new ParseException("Unexpected end of file encountered", line); } buffer.append((char) ch); ch = readChar(); } ch = readChar(); // this removes the whitespace after the closing '@' - or checks for @@ if (ch == '@') { buffer.append('@'); // special case, @ is encoded as @@ } else { verifyWhitespaceOrEndOfLine(ch); token.setValue(buffer.toString(), token.isAtLeftMargin() ? 
Token.Type.START_TOKEN : Token.Type.STRING_TOKEN); return token; } } } private void verifyWhitespaceOrEndOfLine(int ch) throws ParseException { if (!Character.isWhitespace(ch) && ch != -1) { throw new ParseException("Not a whitespace as expected : " + ch + " '" + (char) ch + "'", line); } } private boolean isHex(int ch) { return Character.isDigit(ch) || isHexChar(ch) || isBase64Char(ch); } private boolean isHexChar(int ch) { return ch == 'A' || ch == 'B' || ch == 'C' || ch == 'D' || ch == 'E' || ch == 'F'; } private boolean isBase64Char(int ch) { String base64chars = "GHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/"; return base64chars.indexOf(ch) >= 0; } private int readChar() throws IOException { if (lineReader == null) { if (nextLine()) return -1; } int ch = lineReader.read(); atLeftMargin = false; if (ch == -1) { // end reached if (nextLine()) return -1; ch = '\n'; } column++; return ch; } // return true if end-of-file reached // otherwise false private boolean nextLine() throws IOException { String aLine = reader.readLine(); if (aLine == null) { return true; } lineReader = new StringReader(aLine); ++line; column = 0; atLeftMargin = true; return false; } private static void testTokenizer(String test) { Reader reader = new StringReader(test); Tokenizer tokenizer = new Tokenizer(reader); Token token; try { token = tokenizer.nextToken(); while (token != null) { System.out.println("Found : " + token); token = tokenizer.nextToken(); } } catch (ParseException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } public static void main(String[] args) { // Testing methods // for now only command line, later special tests, maybe throw TestUnit if (args.length == 0) { testTokenizer("@pv@ 4 @db.have@ 12345 "); testTokenizer("@dv@ 1 @db.rev@ 123 3245 456 7 "); testTokenizer("@rv@ 2 @db.foo@ @a blank in here does not matter@\n"); testTokenizer("@vv@ 1 @db.bar@ @so\nwhat\nif\nthere\nare\nreturns@\n"); testTokenizer("@pv@ 7 @db.working@ @@@ex@@@ 1 @ha 
ha ha@\n"); testTokenizer("@ex@ 1\n@vv@ @db.counter@ @journal@ 2FD005066B82F8949B0E8ADEA6582C74 1\n"); testTokenizer("@pv@ 7 @db.rev@ 07BzhNET7exJ6qYjitX/AA"); } else { try { Tokenizer tokenizer = new Tokenizer(new FileReader(args[0])); Token token = tokenizer.nextToken(); while (token != null) { System.out.println("Found :" + token); token = tokenizer.nextToken(); } } catch (Exception e) { e.printStackTrace(); } } } public int getColumn() { return column; } public int getLine() { return line; } }
# | Change | User | Description | Committed | |
---|---|---|---|---|---|
#2 | 23518 | Sven Erik Knop | Upgrade the Tokenizer to deal with Base64 encoded numbers. | ||
#1 | 7589 | Sven Erik Knop | Rescue attempt to recover missing files from the JournalReader | ||
//guest/sven_erik_knop/JournalReader/src/journal/reader/Tokenizer.java | |||||
#2 | 7375 | Sven Erik Knop |
Major update of the JournalReader. Complete rewrite of the command line parsing. Change in the options parsing within the journal reader. New SQLLoader action. Currently only against MySQL (needs MySQL JDBC driver) with fixed database and user name. This will be replaced by a config file at some stage. |
||
#1 | 7374 | Sven Erik Knop | Rename/move file(s) - correct location for Eclipse project | ||
//guest/sven_erik_knop/JournalReader/journal/reader/Tokenizer.java | |||||
#1 | 6467 | Sven Erik Knop |
Added JournalReader, a Java library of useful tools to read and process checkpoints and journals. Added are a readme.txt to explain some details, and a jar file that contains the compiled class files. The programs will need Java 1.6 to run. |