/*
 * SSSync, a Simple and Stupid Synchronizer for data with multi-valued attributes
 * Copyright (C) 2014 Ludovic Pouzenc
 *
 * This file is part of SSSync.
 *
 * SSSync is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * SSSync is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with SSSync. If not, see <http://www.gnu.org/licenses/>.
 */

package data.io.csv;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import data.MVDataEntry;
import data.io.AbstractMVDataReader;

/**
 * Stream-oriented reader from a particular CSV file.
 * Each CSV line carries three columns : "key", "attr" and "values" (values are separated by ';').
 * Consecutive lines sharing the same key are merged into a single {@link MVDataEntry}.
 * <p>
 * Entries are meant to be returned sorted by lexicographical ascending key. When the input is
 * not declared as already sorted, the raw text lines are sorted in memory before parsing.
 * Note that this sort compares whole lines (not parsed keys), drops lines that are exactly
 * identical, and treats every physical line as one record.
 * <p>
 * This object is its own iterator : {@link #iterator()} must be called before
 * {@link #hasNext()} / {@link #next()}, and multiple simultaneous iterations are not supported.
 *
 * @author lpouzenc
 */
public class CSVDataReader extends AbstractMVDataReader {

    /** Small unsorted CSV sample (no header line, last line has an empty key). */
    public static final String CSV_DEMO =
        //"key,attr,values\n" +
        "line3,hello,all;the;others\n" +
        "line1,from,csv1;csv1bis\n" +
        "line2,hello,all;the;world\n" +
        "line1,attr2,csv1\n" +
        ",,\n";

    /** Format used when none is given : Excel flavor, fixed column names, surrounding spaces ignored. */
    public static final CSVFormat DEFAULT_CSV_FORMAT = CSVFormat.EXCEL
        .withHeader("key","attr","values")
        .withIgnoreSurroundingSpaces(true);

    private final CSVFormat format;
    private final Reader dataSourceStream;

    /** Entry built by the last look-ahead and not yet returned by next(), or null. */
    private transient MVDataEntry nextEntry;
    /** CSV record already read from the parser but belonging to the following entry, or null. */
    private transient CSVRecord nextCSVRecord;
    /** Iterator over the records of the current CSVParser ; set by iterator(). */
    private transient Iterator<CSVRecord> csvIt;
    /** True once iterator() has been called at least once on this instance. */
    private transient boolean iteratorRequested;

    /**
     * Constructs a CSVDataReader object for parsing a CSV input given via dataSourceStream.
     * @param dataSourceName A short string representing this reader (for logging)
     * @param dataSourceStream A java.io.Reader from which read the actual CSV data, typically a FileReader
     * @param alreadySorted If false, the whole input is read, sorted in memory and dataSourceStream is closed
     *                      (memory cost is around 3 times the CSV file size !)
     * @param format Specify the exact format used to encode the CSV file (separators, escaping...)
     * @throws IOException if reading the input fails while sorting it
     */
    public CSVDataReader(String dataSourceName, Reader dataSourceStream, boolean alreadySorted, CSVFormat format) throws IOException {
        this.dataSourceName = dataSourceName;
        this.format = format;

        if ( alreadySorted ) {
            this.dataSourceStream = dataSourceStream;
        } else {
            BufferedReader bufReader;
            if ( dataSourceStream instanceof BufferedReader ) {
                bufReader = (BufferedReader) dataSourceStream;
            } else {
                bufReader = new BufferedReader(dataSourceStream);
            }
            this.dataSourceStream = readAndSortLines(bufReader);
        }
    }

    /**
     * Constructs a CSVDataReader object with default CSV format (for CSVParser).
     * @param dataSourceName A short string representing this reader (for logging)
     * @param dataSourceStream A java.io.Reader from which read the actual CSV data, typically a FileReader
     * @param alreadySorted If false, the whole input is read, sorted in memory and dataSourceStream is closed
     *                      (memory cost is around 3 times the CSV file size !)
     * @throws IOException if reading the input fails while sorting it
     */
    public CSVDataReader(String dataSourceName, Reader dataSourceStream, boolean alreadySorted) throws IOException {
        this(dataSourceName, dataSourceStream, alreadySorted, DEFAULT_CSV_FORMAT);
    }

    /**
     * {@inheritDoc}
     * Note : multiple iterators on the same instance are not supported.
     * Each call restarts the parsing ; restarting more than once requires a Reader that supports reset().
     *
     * @throws RuntimeException wrapping the IOException if the source can't be rewound or parsed
     */
    @Override
    public Iterator<MVDataEntry> iterator() {
        // When a new iterator is requested, everything should be reset
        CSVParser parser;
        try {
            rewindDataSource();
            parser = new CSVParser(dataSourceStream, format);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        csvIt = parser.iterator();
        nextCSVRecord = null;
        nextEntry = null;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean hasNext() {
        if ( nextEntry == null ) {
            lookAhead();
        }
        return ( nextEntry != null );
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MVDataEntry next() {
        if ( !hasNext() ) {
            throw new NoSuchElementException();
        }
        // Pop the lookahead record
        MVDataEntry res = nextEntry;
        nextEntry = null;
        // And return it
        return res;
    }

    /**
     * Puts the data source back to its starting point before a new parsing pass.
     * A Reader that can't be reset (a plain FileReader for instance) is tolerated for the very
     * first pass, because this instance has not consumed anything from it yet.
     * @throws IOException if the reset fails while a previous pass has already been started
     */
    private void rewindDataSource() throws IOException {
        boolean firstPass = !iteratorRequested;
        iteratorRequested = true;
        try {
            dataSourceStream.reset();
        } catch (IOException e) {
            if ( !firstPass ) {
                throw e;
            }
            // First pass on a non-resettable stream : read it from its current position
        }
    }

    /**
     * In-memory sorting of the text lines of a file.
     * Lines are compared as whole strings and exact duplicates are kept only once.
     * @param bufReader the source to read entirely ; it is always closed by this method
     * @return a Reader over all the sorted lines, each one terminated by "\n"
     * @throws IOException if reading or closing bufReader fails
     */
    private Reader readAndSortLines(BufferedReader bufReader) throws IOException {
        // Put all the CSV in memory, in a SortedSet
        SortedSet<String> lineSet = new TreeSet<String>();
        int totalCSVSize = 0;
        try {
            String inputLine;
            while ( (inputLine = bufReader.readLine()) != null ) {
                lineSet.add(inputLine);
                totalCSVSize += inputLine.length() + 1;
            }
        } finally {
            bufReader.close(); // Closes also dataSourceStream, even if reading failed
        }

        // Put all sorted lines in a String
        StringBuilder allLines = new StringBuilder(totalCSVSize);
        for ( String line : lineSet ) {
            allLines.append(line).append('\n');
        }
        lineSet = null; // Could help the GC if the input file is huge

        // Build a Java Reader from that String
        return new StringReader(allLines.toString());
    }

    /**
     * Builds the next MVDataEntry (if any) and stores it in nextEntry.
     * A MVDataEntry could be represented on many CSV lines.
     * The key is repeated, the attr could change, the values should change (for given key/attr pair).
     * The first record carrying a different key is kept in nextCSVRecord for the following call.
     */
    private void lookAhead() {
        if ( nextEntry != null ) {
            return; // Already looked ahead, keep the pending entry untouched
        }

        MVDataEntry currEntry = null;
        while ( true ) {
            // Try to get a valid CSVRecord (reuse the one left over by the previous call, if any)
            if ( nextCSVRecord == null ) {
                nextCSVRecord = nextValidCSVRecord();
            }
            // If no more CSV data, return what has been gathered so far (possibly nothing)
            if ( nextCSVRecord == null ) {
                break;
            }

            // Now we have a valid CSV line to put in a MVDataEntry
            String newKey = nextCSVRecord.get("key");
            if ( currEntry == null ) {
                // No MVDataEntry yet, it's time to create it (we have data to put into)
                currEntry = new MVDataEntry(newKey);
            } else if ( !currEntry.getKey().equals(newKey) ) {
                // Keys are different, we are done (and we have remaining CSV data in nextCSVRecord)
                break;
            }

            // CSV line key matches MVDataEntry key, appends attr/values on it
            currEntry.splitAndPut(nextCSVRecord.get("attr"), nextCSVRecord.get("values"), ";");
            nextCSVRecord = null; // Record consumed
        }
        nextEntry = currEntry;
    }

    /**
     * Seek for the next valid record in the CSV file.
     * A record is considered valid when its "key" column is neither null nor empty.
     * @return the next valid CSVRecord, or null if there is no more valid record
     */
    private CSVRecord nextValidCSVRecord() {
        while ( csvIt.hasNext() ) {
            CSVRecord candidate = csvIt.next();
            // Stop if nothing readable
            if ( candidate == null ) {
                return null;
            }
            // Skip invalid and empty lines
            String key = candidate.get("key");
            if ( key != null && !key.isEmpty() ) {
                return candidate;
            }
        }
        return null;
    }
}