开发者

How to improve the speed of this code?

I'm trying to import all googlebooks-1gram files into a postgresql database. I wrote the following Java code for that:

public class ToPostgres {

    public static void main(String[] args) throws Exception {
        String filePath = "./";
        List<String> files = new ArrayList<String>();
        for (int i =0; i < 10; i++) {
            files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
        }
        Connection c = null;
        try {
            c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
                    "postgres", "xxxxxx");
        } catch (SQLException e) {
            e.printStackTrace();
        }

        if (c != null) {
            try {
                PreparedStatement wordInsert = c.prepareStatement(
                    "INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
                );
                PreparedStatement countInsert = c.prepareStatement(
                    "INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
                    "VALUES (?,?,?,?,?)"
                );
                String lastWord = "";
                Long lastId = -1L;
                for (String filename: files) {
                    BufferedReader input =  new BufferedReader(new FileReader(new File(filename)));
                    String line = "";
                    while ((line = input.readLine()) != null) {
                        String[] data = line.split("\t");
                        Long id = -1L;
                        if (lastWord.equals(data[0])) {
                            id = lastId;
                        } else {
                            wordInsert.setString(1, data[0]);
                            wordInsert.executeUpdate();
                            ResultSet resultSet = wordInsert.getGeneratedKeys();
                            if (resultSet != null && resultSet.next()) 
                            {
                                id = resultSet.getLong(1);
                            }
                        }
                        countInsert.setLong(1, id);
                        countInsert.setInt(2, Integer.parseInt(data[1]));
                        countInsert.setInt(3, Integer.parseInt(data[2]));
                        countInsert.setInt(4, Integer.parseInt(data[3]));
                        countInsert.setInt(5, Integer.parseInt(data[4]));
                        countInsert.executeUpdate();
                        lastWord = data[0];
                        lastId = id;
                    }
                }
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

}

However, when running this for ~3 hours it only placed 1.000.000 entries in the wordcounts table. When I check the amount of lines in the entire 1gram dataset it's 500.000.000 lines. So to import everything would take about 62.5 days, I can accept that it imports in about a week, but 2 months? I think I'm doing something seriously wrong here(I do have a server that runs 24/7, so I can actually run it for this long, but faster would be nice XD)

EDIT: This code is how I solved it:

public class ToPostgres {

    public static void main(String[] args) throws Exception {
        String filePath = "./";
        List<String> files = new ArrayList<String>();
        for (int i =0; i < 10; i++) {
            files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
        }
        Connection c = null;
        try {
            c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
                    "postgres", "xxxxxx");
        } catch (SQLException e) {
            e.printStackTrace();
        }

        if (c != null) {
            c.setAutoCommit(false);
            try {
                PreparedStatement wordInsert = c.prepareStatement(
                    "INSERT INTO words (id, word) VALUES (?,?)"
                );
                PreparedStatement countInsert = c.prepareStatement(
                    "INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
                    "VALUES (?,?,?,?,?)"
                );
                String lastWord = "";
                Long id = 0L;
                for (String filename: files) {
                    BufferedReader input =  new BufferedReader(new FileReader(new File(filename)));
                    String line = "";
                    int i = 0;
                    while ((line = input.readLine()) != null) {
                        String[] data = line.split("\t");
                        if (!lastWord.equals(data[0])) {
                            id++;
                            wordInsert.setLong(1, id);
                            wordInsert.setString(2, data[0]);
                            wordInsert.executeUpdate();
                        }
                        countInsert.setLong(1, id);
                        countInsert.setInt(2, Integer.parseInt(data[1]));
                        countInsert.setInt(3, Integer.parseInt(data[2]));
                        countInsert.setInt(4, Integer.parseInt(data[3]));
                        countInsert.setInt(5, Integer.parseInt(data[4]));
                        countInsert.executeUpdate();
                        lastWord = data[0];
                      开发者_开发知识库  if (i % 10000 == 0) {
                            c.commit();
                        }
                        if (i % 100000 == 0) {
                            System.out.println(i+" mark file "+filename);
                        }
                        i++;
                    }
                    c.commit();
                }
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

}

I reached 1.5 million rows in about 15 minutes now. That's fast enough for me, thanks all!


JDBC connections have autocommit enabled by default, which carries a per-statement overhead. Try disabling it:

c.setAutoCommit(false)

then commit in batches, something along the lines of:

long ops = 0;

for(String filename : files) {
    // ...
    while ((line = input.readLine()) != null) {
        // insert some stuff...

        ops ++;

        if(ops % 1000 == 0) {
            c.commit();
        }
    }
}

c.commit();


If your table has indexes, it might be faster to delete them, insert the data, and recreate the indexes later.

Setting autocommit off, and doing a manual commit every 10 000 records or so (look into the documentation for a reasonable value - there is some limit) could speed up as well.

Generating the index/foreign key yourself, and keeping track of it should be faster than wordInsert.getGeneratedKeys(); but I'm not sure, whether it is possible from your content.

There is an approach called 'bulk insert'. I don't remember the details, but its a starting point for a search.


Write it to do threading, running 4 threads at the same time, or split it up in sections (read from config file) and distribute it to X machines and have them get the data togeather.


Use batch statements to execute multiple inserts at the same time, rather than one INSERT at a time.

In addition I would remove the part of your algorithm which updates the word count after each insert into the words table, instead just calculate all of the word counts once inserting the words is complete.


Another approach would be to do bulk inserts rather than single inserts. See this question Whats the fastest way to do a bulk insert into Postgres? for more information.


Create threads

String lastWord = "";
    Long lastId = -1L;
    PreparedStatement wordInsert;
    PreparedStatement countInsert ;
    public class ToPostgres {
        public void main(String[] args) throws Exception {
            String filePath = "./";
            List<String> files = new ArrayList<String>();
            for (int i =0; i < 10; i++) {
                files.add(filePath+"googlebooks-eng-all-1gram-20090715-"+i+".csv");
            }
            Connection c = null;
            try {
                c = DriverManager.getConnection("jdbc:postgresql://localhost/googlebooks",
                        "postgres", "xxxxxx");
            } catch (SQLException e) {
                e.printStackTrace();
            }

            if (c != null) {
                try {
                    wordInsert = c.prepareStatement(
                        "INSERT INTO words (word) VALUES (?)", Statement.RETURN_GENERATED_KEYS
                    );
                    countInsert = c.prepareStatement(
                        "INSERT INTO wordcounts (word_id, \"year\", total_count, total_pages, total_books) " +
                        "VALUES (?,?,?,?,?)"
                    );
                    for (String filename: files) {
                        new MyThread(filename). start();
                    }
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }

    }
    class MyThread extends Thread{
        String file;
        public MyThread(String file) {
            this.file = file;
        }
        @Override
        public void run() {         
            try {
                super.run();
                BufferedReader input =  new BufferedReader(new FileReader(new File(file)));
                String line = "";
                while ((line = input.readLine()) != null) {
                    String[] data = line.split("\t");
                    Long id = -1L;
                    if (lastWord.equals(data[0])) {
                        id = lastId;
                    } else {
                        wordInsert.setString(1, data[0]);
                        wordInsert.executeUpdate();
                        ResultSet resultSet = wordInsert.getGeneratedKeys();
                        if (resultSet != null && resultSet.next()) 
                        {
                            id = resultSet.getLong(1);
                        }
                    }
                    countInsert.setLong(1, id);
                    countInsert.setInt(2, Integer.parseInt(data[1]));
                    countInsert.setInt(3, Integer.parseInt(data[2]));
                    countInsert.setInt(4, Integer.parseInt(data[3]));
                    countInsert.setInt(5, Integer.parseInt(data[4]));
                    countInsert.executeUpdate();
                    lastWord = data[0];
                    lastId = id;
                }
            } catch (NumberFormatException e) {
                e.printStackTrace();
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜