The fastest way to parse a large (~2 GB) tab-delimited file
Please guide me: is there a more efficient way to perform these tasks?
- Read key-value pairs from the first tab-delimited file into a map container
- While reading key-value pairs from the second tab-delimited file, write the merged result to a third file
It took 3700 seconds to perform these two steps for fname1 and fname2 with 7,000,000 lines.
#include <string>
#include <map>
#include <fstream>
using namespace std;

int main()
{
    // Placeholder paths; the real program uses the actual file names.
    string fname1 = "file1.tsv", fname2 = "file2.tsv", fname3 = "out.tsv";

    string col1;
    int x;
    map<string, int> m;
    ifstream is1(fname1, ios::in | ios::binary);
    ifstream is2(fname2, ios::in | ios::binary);
    ofstream os3(fname3, ios::out | ios::binary);
    if (is1.is_open())
    {
        while (is1 >> col1 >> x)
        {
            m[col1] = x;
        }
        is1.close();
    }
    if (is2.is_open() && os3.is_open())
    {
        while (is2 >> col1 >> x)
        {
            if (m.count(col1) > 0)
                x += m[col1];
            os3 << col1 << "\t" << x << endl;
        }
        is2.close();
        os3.close();
    }
    return 0;
}
What did I do wrong? Is there a more efficient way to perform these tasks, or is file I/O the bottleneck in most cases?
UPDATE: Here are two implementations of the same algorithm. The main question: why does the Python version run faster? I decided to switch to C++ because I had heard it produces faster code. Was I wrong?
fname1 and fname2 are the inputs; fname3 is the desired output.
fname1:
col1 col2 col3
1 1 1
2 2 2
fname2:
col1 col2 col3
1 1 2
3 3 3
fname3:
col1 col2 col3
1 1 3
2 2 2
3 3 3
def merge_two_files(fname1, fname2, fname3):
    # Python 2 (uses dict.viewkeys)
    fout = open(fname3, 'w')
    fin1 = open(fname1)
    d1 = dict()
    for line in fin1:
        l = line.strip().split('\t')
        key = '\t'.join(l[0:2])
        d1[key] = float(l[2])
    fin1.close()
    d2 = dict()
    fin2 = open(fname2)
    for line in fin2:
        l = line.strip().split('\t')
        key = '\t'.join(l[0:2])
        d2[key] = float(l[2])
    fin2.close()
    for e in d1.viewkeys() & d2.viewkeys():
        line_out = '\t'.join([e, '{:.2f}'.format(d1[e] + d2[e])])
        fout.write(line_out + '\n')
    for e in d1.viewkeys() - (d1.viewkeys() & d2.viewkeys()):
        line_out = '\t'.join([e, '{:.2f}'.format(d1[e])])
        fout.write(line_out + '\n')
    for e in d2.viewkeys() - (d1.viewkeys() & d2.viewkeys()):
        line_out = '\t'.join([e, '{:.2f}'.format(d2[e])])
        fout.write(line_out + '\n')
    fout.close()
#include <algorithm>
#include <fstream>
#include <iterator>
#include <set>
#include <string>
#include <unordered_map>
using namespace std;

int main() {
    unordered_map<string, float> map1, map2;
    set<string> s1, s2, both;
    string col1, col2, key, fname1, fname2, fname3;   // fname1..fname3 are assigned the real paths in the full program
    float col3;
    ifstream f1(fname1, ios::in | ios::binary);
    ifstream f2(fname2, ios::in | ios::binary);
    ofstream f3(fname3, ios::out | ios::binary);
    if (f1.is_open()) {
        while (f1 >> col1 >> col2 >> col3) {
            key = col1 + "\t" + col2;
            map1.insert(make_pair(key, col3));
            s1.insert(key);
        }
    }
    f1.close();
    if (f2.is_open()) {
        while (f2 >> col1 >> col2 >> col3) {
            key = col1 + "\t" + col2;
            map2.insert(make_pair(key, col3));
            s2.insert(key);
        }
    }
    f2.close();
    set_intersection(s1.begin(), s1.end(),
                     s2.begin(), s2.end(),
                     inserter(both, both.begin()));
    if (f3.is_open()) {
        for (const auto& e : both) {
            f3 << e << "\t" << map1.at(e) + map2.at(e) << "\n";
        }
        for (const auto& kv : map1) {
            if (both.count(kv.first)) continue;
            f3 << kv.first << "\t" << kv.second << "\n";
        }
        for (const auto& kv : map2) {
            if (both.count(kv.first)) continue;
            f3 << kv.first << "\t" << kv.second << "\n";
        }
    }
    f3.close();
    return 0;
}
c++ file io
asked Jan 1 at 20:23 by Ivan Vodopyanov, edited Jan 9 at 22:53
Are you using an optimized build? Do you have enough RAM to hold all that content in memory without using the paging file?
– 1201ProgramAlarm
Jan 1 at 22:18
Replace endl with '\n'. endl flushes the file each time and is very inefficient.
– Paul Sanders
Jan 1 at 22:32
I've replaced endl with '\n' and the program accelerated a little, but thank you anyway - I've become more familiar with streams.
– Ivan Vodopyanov
Jan 5 at 22:01
@1201ProgramAlarm, -O3 or -Ofast. I guess, yes. How can I check with PowerShell how much RAM is available before the OS starts to use the paging file?
– Ivan Vodopyanov
Jan 5 at 22:05
2 Answers
At least in the limit of a large number of different keys in these files (I assume you come close enough here), the limiting factor will be the map key lookup, not the I/O. Looking up a key in a std::map has time complexity O(log n), where n is the number of different keys in the container.
Use std::unordered_map, which has amortized O(1) key lookup time on average.
Use a profiler to tell you where most of the time is spent before you begin optimizing on wrong assumptions. Do the same in this case as well, to verify whether I am correct.
– user10605163, answered Jan 1 at 20:32, edited Jan 1 at 20:38
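A minimal sketch of how the question's merge step might look with std::unordered_map, as this answer suggests (the file names, the reserve() size and the use of '\n' instead of endl are assumptions for illustration, not the asker's actual program):

#include <fstream>
#include <string>
#include <unordered_map>
using namespace std;

int main() {
    // Placeholder paths for illustration only.
    ifstream is1("file1.tsv", ios::in | ios::binary);
    ifstream is2("file2.tsv", ios::in | ios::binary);
    ofstream os3("out.tsv", ios::out | ios::binary);

    unordered_map<string, int> m;
    m.reserve(8000000);              // roughly sized for ~7,000,000 keys to limit rehashing

    string col1;
    int x;
    while (is1 >> col1 >> x)         // hashed insert/overwrite: O(1) on average
        m[col1] = x;

    while (is2 >> col1 >> x) {
        auto it = m.find(col1);      // one lookup instead of count() followed by operator[]
        if (it != m.end())
            x += it->second;
        os3 << col1 << "\t" << x << '\n';   // '\n' rather than endl, so no flush per line
    }
    return 0;
}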
Thank you for the rapid answer. Your idea is great; in fact, I do not need sorted keys to look up a key. But the program only accelerated a little. I've implemented the same algorithm in Python: it took 195 seconds versus 1152 seconds with C++. I do not understand why.
– Ivan Vodopyanov
Jan 5 at 21:54
@IvanVodopyanov If you changed to the unordered_map, then in terms of average asymptotic time complexity there is nothing more to improve, really. The rest comes down to improving the linear time factor. Thomas Matthews gave a more comprehensive list of possible such bottlenecks. Use a profiler (e.g. perf on Linux) to figure out in which of these categories your program spends most of its time. You could also add the Python version to the question, so that we can verify whether the programs indeed do the same thing.
– user10605163
Jan 6 at 5:30
@IvanVodopyanov You could also add a Python script (or similar) that generates the input files; then anyone would potentially be able to benchmark/profile it themselves (if anyone has time to do so).
– user10605163
Jan 6 at 5:32
The Bottlenecks
Your bottlenecks are 1) file I/O and 2) format conversion. A file may require overhead such as spinning up (hard drives), seek time and actual read time.
Streaming the Data
Whether the file is located on a Flash drive or a hard drive, streaming the data is most efficient (that is, keep the data flowing). For example, reading 1k of data in 1 transaction is more efficient than reading 1k of data in 1k transactions. Your most efficient method here is to read large quantities into memory. You may want to check your OS to see if it supports memory mapping.
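A minimal sketch of block reading, in the spirit of the paragraph above (the 1 MB block size and the file name are assumptions, and the parsing itself is left as a comment):

#include <fstream>
#include <vector>
using namespace std;

int main() {
    ifstream in("file1.tsv", ios::in | ios::binary);   // placeholder path
    vector<char> buf(1 << 20);                         // 1 MB block; tune for your system
    while (in) {
        in.read(buf.data(), buf.size());
        streamsize n = in.gcount();                    // bytes actually read in this block
        if (n <= 0)
            break;
        // parse buf[0..n) here, carrying any partial last line over to the next block
    }
    return 0;
}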
Converting the Data
Your next bottleneck is parsing and converting the data from textual to internal numeric format. It takes time. To speed this up, have your data written in a binary (internal) representation. This may not be possible due to endianness or floating-point issues.
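As one concrete option for the conversion step (my assumption, not something the answer prescribes): C++17's std::from_chars parses numbers without locale or stream overhead:

#include <charconv>       // std::from_chars (C++17)
#include <string_view>
#include <system_error>

// Parse one integer field of a tab-delimited line; returns false on malformed input.
bool parse_int_field(std::string_view field, int& out) {
    auto result = std::from_chars(field.data(), field.data() + field.size(), out);
    return result.ec == std::errc() && result.ptr == field.data() + field.size();
}

// Example: int x; parse_int_field("42", x);   // x == 42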
Storing the Data
I guess the next bottleneck is storing the internally formatted data into a data structure. For key-value pairs, you may want to use std::map or std::unordered_map. If you don't care about search time, you may want to create a struct with the values and store it into an array (or a std::vector if the quantity is not known at compile time).
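A rough sketch of the struct-plus-vector layout described above, with an optional index for lookups (the field names, the reserved size and the index are illustrative assumptions):

#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

struct Record {               // one parsed line
    std::string key;          // e.g. "col1\tcol2"
    float value;              // e.g. col3
};

int main() {
    std::vector<Record> table;          // the data itself, in file order
    table.reserve(7000000);             // assumed row count, taken from the question

    // ... fill `table` while reading the file ...

    // Optional index table (see the comments below): key -> position in `table`.
    std::unordered_map<std::string, std::size_t> index;
    index.reserve(table.size());
    for (std::size_t i = 0; i < table.size(); ++i)
        index.emplace(table[i].key, i);
    return 0;
}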
Profiling
The best way to find out where your bottlenecks are is to profile. Run your application for 1E6 repetitions and take the average (you may need to run 1E9 to get better accuracy). Search the internet for "Benchmarking C++ programs". Benchmarking will show you how to get better results.
– Thomas Matthews, answered Jan 1 at 21:43
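A minimal timing sketch in the spirit of this advice, assuming a work() function stands in for the code being measured (the repetition count is an arbitrary placeholder):

#include <chrono>
#include <iostream>

void work() { /* the code being measured goes here */ }

int main() {
    using clock = std::chrono::steady_clock;
    const int reps = 100;                        // placeholder; pick a count that suits the workload
    auto start = clock::now();
    for (int i = 0; i < reps; ++i)
        work();
    auto stop = clock::now();
    std::chrono::duration<double> per_run = (stop - start) / reps;
    std::cout << per_run.count() << " s per run\n";
    return 0;
}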
Thank you, Thomas. I did not understand the 'Storing the Data' section. Of course I do care about search time, so have you advised me not to use struct and vector?
– Ivan Vodopyanov
Jan 5 at 22:00
It all depends. Search the internet for "database theory index tables". A struct or class is good at modeling the data. You can place all the data into a vector, but unsorted vectors don't have good search times. In the database arena, all the data is placed into a table (a.k.a. a vector), then index tables are built. The index tables (std::map) contain the search key and the index into the vector. The index table provides faster search times than re-sorting the vector each time.
– Thomas Matthews
Jan 5 at 22:25
If you don't know the amount of data at compile time, another bottleneck will be expanding the container (vector or array). The vector allocates room for X items; when you push_back() past the Xth item, the vector needs to expand. Expanding involves allocating new, larger memory and copying the old memory to the new memory, all of which takes time. You can reduce this by having your data source specify the number of records in a header section of the data file (if possible).
– Thomas Matthews
Jan 5 at 22:29
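A small sketch of that last suggestion, assuming the first token of the file is a record count written into a header (which is not the case for the question's files as shown):

#include <cstddef>
#include <fstream>
#include <string>
#include <vector>

struct Record { std::string key; float value; };

int main() {
    std::ifstream in("file1.tsv");     // placeholder path
    std::size_t n = 0;
    in >> n;                           // assumed header: first token is the record count
    std::vector<Record> table;
    table.reserve(n);                  // one allocation up front instead of repeated growth
    // ... read and push_back() the n records ...
    return 0;
}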
I have often sped up programs by block reading from the file into a buffer. The blocks would be around 1 MB. Another layer of efficiency is to use multiple buffers and threads. The reader thread reads a buffer, then signals the computation thread, then starts on the next buffer. This allows the reader to operate continuously while the computation thread executes. The amount and sizes of the buffers would need to be adjusted for optimal efficiency in your situation (one size does not fit all). Search the web for "double buffering multiple buffers".
– Thomas Matthews
Jan 5 at 22:33
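A minimal sketch of the double-buffering pattern described in the comment above: one thread reads 1 MB blocks while the other parses the block it was last handed (the path, block size and single spare buffer are assumptions; a production version would likely use more buffers):

#include <condition_variable>
#include <fstream>
#include <mutex>
#include <thread>
#include <vector>

int main() {
    std::ifstream in("file1.tsv", std::ios::binary);   // placeholder path
    std::vector<char> block(1 << 20);                  // block handed to the computation thread
    std::streamsize n = 0;
    bool ready = false, done = false;
    std::mutex mtx;
    std::condition_variable cv;

    std::thread consumer([&] {
        std::vector<char> mine;                        // local copy to parse
        for (;;) {
            std::unique_lock<std::mutex> lk(mtx);
            cv.wait(lk, [&] { return ready || done; });
            if (ready) {
                mine.assign(block.begin(), block.begin() + n);
                ready = false;
            } else {
                return;                                // done and nothing left to consume
            }
            lk.unlock();
            cv.notify_one();
            // ... parse `mine` here while the reader fills the next block ...
        }
    });

    std::vector<char> buf(1 << 20);                    // block currently being read
    while (in) {
        in.read(buf.data(), buf.size());
        std::streamsize got = in.gcount();
        if (got <= 0)
            break;
        std::unique_lock<std::mutex> lk(mtx);
        cv.wait(lk, [&] { return !ready; });           // wait until the consumer took the last block
        block.swap(buf);
        n = got;
        ready = true;
        lk.unlock();
        cv.notify_one();
    }
    { std::lock_guard<std::mutex> lk(mtx); done = true; }
    cv.notify_one();
    consumer.join();
    return 0;
}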