diff options
Diffstat (limited to 'src/connectors/src/data/io/csv')
-rw-r--r-- | src/connectors/src/data/io/csv/CSVDataReader.java | 248 |
1 files changed, 248 insertions, 0 deletions
/*
 * SSSync, a Simple and Stupid Synchronizer for data with multi-valued attributes
 * Copyright (C) 2014 Ludovic Pouzenc <ludovic@pouzenc.fr>
 *
 * This file is part of SSSync.
 *
 * SSSync is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * SSSync is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with SSSync. If not, see <http://www.gnu.org/licenses/>
 */
package data.io.csv;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import data.MVDataEntry;
import data.io.AbstractMVDataReader;

/**
 * Stream-oriented reader from a particular CSV file.
 * <p>
 * Consecutive CSV records sharing the same key are merged into a single
 * {@link MVDataEntry}. Entries are returned in the order of the (possibly
 * pre-sorted) input. When the reader is asked to sort the input itself, the
 * sort is applied to the raw text lines (natural {@link String} order) and
 * identical lines are kept only once.
 * <p>
 * This object is its own iterator: {@link #iterator()} must be called before
 * {@link #hasNext()} / {@link #next()}, and multiple simultaneous iterations
 * on the same instance are not supported.
 *
 * @author lpouzenc
 */
public class CSVDataReader extends AbstractMVDataReader {

    /**
     * Small unsorted sample input. Columns are, in order: key, attr, values
     * (values being separated by ';'). The last line has an empty key and is
     * therefore skipped when read.
     */
    public static final String CSV_DEMO =
        "line3,hello,all;the;others\n" +
        "line1,from,csv1;csv1bis\n" +
        "line2,hello,all;the;world\n" +
        "line1,attr2,csv1\n" +
        ",,\n";

    /** Default format: Excel flavour, explicit header names, surrounding spaces ignored. */
    public static final CSVFormat DEFAULT_CSV_FORMAT = CSVFormat.EXCEL
        .withHeader("key","attr","values")
        .withIgnoreSurroundingSpaces(true);

    private final CSVFormat format;
    private final Reader dataSourceStream;

    /** Entry already built by {@link #lookAhead()} and not yet returned by {@link #next()}. */
    private transient MVDataEntry nextEntry;
    /** CSV record already read from the parser but belonging to the following entry. */
    private transient CSVRecord nextCSVRecord;
    /** Record iterator of the current parser, null until {@link #iterator()} is called. */
    private transient Iterator<CSVRecord> csvIt;
    /** True once {@link #iterator()} has been called at least one time. */
    private transient boolean iterationStarted;


    /**
     * Constructs a CSVDataReader object for parsing a CSV input given via dataSourceStream.
     * @param dataSourceName A short string representing this reader (for logging)
     * @param dataSourceStream A java.io.Reader from which read the actual CSV data, typically a FileReader
     * @param alreadySorted If false, the whole input is read, sorted in memory and the given
     *        stream is closed; memory cost is around 3 times the CSV file size !
     * @param format Specify the exact format used to encode the CSV file (separators, escaping...)
     * @throws IOException if the input can't be read while sorting it
     */
    public CSVDataReader(String dataSourceName, Reader dataSourceStream, boolean alreadySorted, CSVFormat format) throws IOException {
        this.dataSourceName = dataSourceName;
        this.format = format;

        if ( alreadySorted ) {
            this.dataSourceStream = dataSourceStream;
        } else {
            // Line-oriented reading needs a BufferedReader, reuse the caller's one if possible
            BufferedReader bufReader = ( dataSourceStream instanceof BufferedReader )
                ? (BufferedReader) dataSourceStream
                : new BufferedReader(dataSourceStream);
            this.dataSourceStream = readAndSortLines(bufReader);
        }
    }

    /**
     * Constructs a CSVDataReader object with default CSV format (for CSVParser).
     * @param dataSourceName A short string representing this reader (for logging)
     * @param dataSourceStream A java.io.Reader from which read the actual CSV data, typically a FileReader
     * @param alreadySorted If false, the whole input is read, sorted in memory and the given
     *        stream is closed; memory cost is around 3 times the CSV file size !
     * @throws IOException if the input can't be read while sorting it
     */
    public CSVDataReader(String dataSourceName, Reader dataSourceStream, boolean alreadySorted) throws IOException {
        this(dataSourceName, dataSourceStream, alreadySorted, DEFAULT_CSV_FORMAT);
    }

    /**
     * {@inheritDoc}
     * Note : multiple iterators on the same instance are not supported.
     * Iterating more than once requires a data source that supports
     * {@link Reader#reset()} (always the case when this reader sorted the input itself).
     *
     * @throws RuntimeException wrapping the IOException if the data source can't be
     *         rewound for a new iteration or if the CSV parser can't be created
     */
    @Override
    public Iterator<MVDataEntry> iterator() {
        // When a new iterator is requested, everything should be reset
        CSVParser parser;
        try {
            rewindDataSource();
            parser = new CSVParser(dataSourceStream, format);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        csvIt = parser.iterator();
        nextCSVRecord = null;
        nextEntry = null;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean hasNext() {
        if ( nextEntry == null ) {
            lookAhead();
        }
        return ( nextEntry != null );
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MVDataEntry next() {
        if ( !hasNext() ) {
            throw new NoSuchElementException();
        }
        // Pop the lookahead record
        MVDataEntry res = nextEntry;
        nextEntry = null;
        // And return it
        return res;
    }

    /**
     * Puts the data source back to its starting point before a (re-)iteration.
     * <p>
     * Many readers (FileReader, a BufferedReader without mark...) can't be reset.
     * That is harmless for the very first iteration because this object has not
     * consumed anything from the stream yet, so the failure is ignored in that
     * case only. For any further iteration the failure is reported.
     *
     * @throws IOException if the source can't be rewound for a second or later iteration
     */
    private void rewindDataSource() throws IOException {
        try {
            dataSourceStream.reset();
        } catch (IOException e) {
            if ( iterationStarted ) {
                throw e;
            }
            // First iteration : nothing was read by this object yet, go on without rewinding
        } finally {
            iterationStarted = true;
        }
    }

    /**
     * In-memory sorting of all the text lines of the input.
     * Identical lines are kept only once. The given reader is always closed,
     * even if reading fails.
     *
     * @param bufReader the input to read entirely, then close (closes also the wrapped stream)
     * @return a Reader over the sorted lines, each one terminated by '\n'
     * @throws IOException if reading or closing the input fails
     */
    private Reader readAndSortLines(BufferedReader bufReader) throws IOException {
        // Put all the CSV in memory, in a SortedSet
        SortedSet<String> lineSet = new TreeSet<String>();
        int totalCSVSize = 0;
        try {
            String inputLine;
            while ((inputLine = bufReader.readLine()) != null) {
                // Only count lines actually kept (duplicates are dropped by the set)
                if ( lineSet.add(inputLine) ) {
                    totalCSVSize += inputLine.length() + 1;
                }
            }
        } finally {
            bufReader.close(); // Closes also dataSourceStream
        }

        // Put all sorted lines in a String
        StringBuilder allLines = new StringBuilder(totalCSVSize);
        for ( String line : lineSet ) {
            allLines.append(line).append('\n');
        }
        lineSet = null; // Could help the GC if the input file is huge

        // Build a Java Reader from that String
        return new StringReader(allLines.toString());
    }

    /**
     * Builds the next MVDataEntry (if any) and stores it in nextEntry.
     * <p>
     * A MVDataEntry could be represented on many CSV lines.
     * The key is repeated, the attr could change, the values should change (for given key/attr pair).
     * All consecutive valid records having the same key are merged in one entry.
     * The first record with a different key is kept in nextCSVRecord for the following call.
     *
     * @throws IllegalStateException if iterator() was never called on this object
     */
    private void lookAhead() {
        if ( nextEntry != null ) {
            return; // Already looked ahead, don't lose the pending entry
        }
        if ( csvIt == null ) {
            throw new IllegalStateException("iterator() must be called before hasNext() or next()");
        }

        MVDataEntry currEntry = null;
        while ( true ) {
            // Try to get a valid CSVRecord (unless one is remaining from the previous call)
            if ( nextCSVRecord == null ) {
                nextCSVRecord = nextValidCSVRecord();
            }
            // If no more CSV data, return what was gathered so far (maybe nothing)
            if ( nextCSVRecord == null ) {
                break;
            }

            // Now we have a valid CSV line to put in a MVDataEntry
            String newKey = nextCSVRecord.get("key");
            if ( currEntry == null ) {
                // No MVDataEntry yet, it's time to create it (we have data to put into)
                currEntry = new MVDataEntry(newKey);
            } else if ( !currEntry.getKey().equals(newKey) ) {
                // Keys are different, we are done (and we have remaining CSV data in nextCSVRecord)
                break;
            }

            // CSV line key matches MVDataEntry key, appends attr/values on it
            currEntry.splitAndPut(nextCSVRecord.get("attr"), nextCSVRecord.get("values"), ";");
            nextCSVRecord = null; // Record consumed
        }

        nextEntry = currEntry;
    }

    /**
     * Seek for the next valid record in the CSV file.
     * A record is valid when its "key" column is neither null nor empty;
     * other records are silently skipped.
     *
     * @return the next valid CSVRecord, or null if there is no more
     */
    private CSVRecord nextValidCSVRecord() {
        while ( csvIt.hasNext() ) {
            CSVRecord candidate = csvIt.next();

            // Give up if nothing readable
            if ( candidate == null ) {
                return null;
            }

            // Skip invalid and empty lines
            String key = candidate.get("key");
            if ( key != null && !key.isEmpty() ) {
                return candidate;
            }
        }
        return null;
    }
}