Concat DataFrame Reindexing only valid with uniquely valued Index objects
I am trying to concat the following:
df1
price side timestamp
timestamp
2016-01-04 00:01:15.631331072 0.7286 2 1451865675631331
2016-01-04 00:01:15.631399936 0.7286 2 1451865675631400
2016-01-04 00:01:15.631860992 0.7286 2 1451865675631861
2016-01-04 00:01:15.631866112 0.7286 2 1451865675631866
and
df2
bid bid_size offer offer_size
timestamp
2016-01-04 00:00:31.331441920 0.7284 4000000 0.7285 1000000
2016-01-04 00:00:53.631324928 0.7284 4000000 0.7290 4000000
2016-01-04 00:01:03.131234048 0.7284 5000000 0.7286 4000000
2016-01-04 00:01:12.131444992 0.7285 1000000 0.7286 4000000
2016-01-04 00:01:15.631364096 0.7285 4000000 0.7290 4000000
With
data = pd.concat([df1,df2], axis=1)
But I get the following output:
InvalidIndexError Traceback (most recent call last)
<ipython-input-38-2e88458f01d7> in <module>()
----> 1 data = pd.concat([df1,df2], axis=1)
2 data = data.fillna(method='pad')
3 data = data.fillna(method='bfill')
4 data['timestamp'] = data.index.values#converting to datetime
5 data['timestamp'] = pd.to_datetime(data['timestamp'])#converting to datetime
/usr/local/lib/python2.7/site-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
810 keys=keys, levels=levels, names=names,
811 verify_integrity=verify_integrity,
--> 812 copy=copy)
813 return op.get_result()
814
/usr/local/lib/python2.7/site-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
947 self.copy = copy
948
--> 949 self.new_axes = self._get_new_axes()
950
951 def get_result(self):
/usr/local/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_new_axes(self)
1013 if i == self.axis:
1014 continue
-> 1015 new_axes[i] = self._get_comb_axis(i)
1016 else:
1017 if len(self.join_axes) != ndim - 1:
/usr/local/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_comb_axis(self, i)
1039 raise TypeError("Cannot concatenate list of %s" % types)
1040
-> 1041 return _get_combined_index(all_indexes, intersect=self.intersect)
1042
1043 def _get_concat_axis(self):
/usr/local/lib/python2.7/site-packages/pandas/core/index.pyc in _get_combined_index(indexes, intersect)
6120 index = index.intersection(other)
6121 return index
-> 6122 union = _union_indexes(indexes)
6123 return _ensure_index(union)
6124
/usr/local/lib/python2.7/site-packages/pandas/core/index.pyc in _union_indexes(indexes)
6149
6150 if hasattr(result, 'union_many'):
-> 6151 return result.union_many(indexes[1:])
6152 else:
6153 for other in indexes[1:]:
/usr/local/lib/python2.7/site-packages/pandas/tseries/index.pyc in union_many(self, others)
959 else:
960 tz = this.tz
--> 961 this = Index.union(this, other)
962 if isinstance(this, DatetimeIndex):
963 this.tz = tz
/usr/local/lib/python2.7/site-packages/pandas/core/index.pyc in union(self, other)
1553 result.extend([x for x in other._values if x not in value_set])
1554 else:
-> 1555 indexer = self.get_indexer(other)
1556 indexer, = (indexer == -1).nonzero()
1557
/usr/local/lib/python2.7/site-packages/pandas/core/index.pyc in get_indexer(self, target, method, limit, tolerance)
1890
1891 if not self.is_unique:
-> 1892 raise InvalidIndexError('Reindexing only valid with uniquely'
1893 ' valued Index objects')
1894
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
I have removed extra columns and dropped duplicates and NA values where there could be a conflict, but I simply do not know what's wrong.
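For reference, one quick way to confirm whether either index really is unique (assuming pd is pandas and df1/df2 are as shown above):
print(df1.index.is_unique, df2.index.is_unique)  # both must be True for concat along axis=1
print(df1[df1.index.duplicated(keep=False)])     # shows any rows that share a timestamp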
Please help
Thanks
python numpy pandas
what does pd.concat do?
– gmoshkin
Jan 29 '16 at 12:05
@gmoshkin it places dataframes together as one dataframe - joined on the axis.
– noidea
Jan 29 '16 at 12:29
I have removed the column timestamp from both df1 and df2 and attempted to drop any NA with df1.dropna() and df2.dropna(); the problem persists...
– noidea
Jan 29 '16 at 12:49
I do not understand what pd, df1, and df2 are, so I have no idea what you are trying to do or what goes wrong.
– gmoshkin
Jan 29 '16 at 14:36
@gmoshkin, I am assuming that pd is an alias for pandas (import pandas as pd), and that df1 and df2 are pandas DataFrame objects.
– jugovich
Nov 16 '17 at 2:30
3 Answers
pd.concat requires that the indices be unique. To remove rows with duplicate indices, use
df = df.loc[~df.index.duplicated(keep='first')]
import pandas as pd
df1 = pd.DataFrame(
{'price': [0.7286, 0.7286, 0.7286, 0.7286],
'side': [2, 2, 2, 2],
'timestamp': [1451865675631331, 1451865675631400,
1451865675631861, 1451865675631866]},
index=pd.DatetimeIndex(['2000-1-1', '2000-1-1', '2001-1-1', '2002-1-1']))
df2 = pd.DataFrame(
{'bid': [0.7284, 0.7284, 0.7284, 0.7285, 0.7285],
'bid_size': [4000000, 4000000, 5000000, 1000000, 4000000],
'offer': [0.7285, 0.729, 0.7286, 0.7286, 0.729],
'offer_size': [1000000, 4000000, 4000000, 4000000, 4000000]},
index=pd.DatetimeIndex(['2000-1-1', '2001-1-1', '2002-1-1', '2003-1-1', '2004-1-1']))
df1 = df1.loc[~df1.index.duplicated(keep='first')]
# price side timestamp
# 2000-01-01 0.7286 2 1451865675631331
# 2001-01-01 0.7286 2 1451865675631861
# 2002-01-01 0.7286 2 1451865675631866
df2 = df2.loc[~df2.index.duplicated(keep='first')]
# bid bid_size offer offer_size
# 2000-01-01 0.7284 4000000 0.7285 1000000
# 2001-01-01 0.7284 4000000 0.7290 4000000
# 2002-01-01 0.7284 5000000 0.7286 4000000
# 2003-01-01 0.7285 1000000 0.7286 4000000
# 2004-01-01 0.7285 4000000 0.7290 4000000
result = pd.concat([df1, df2], axis=0)
print(result)
bid bid_size offer offer_size price side timestamp
2000-01-01 NaN NaN NaN NaN 0.7286 2 1.451866e+15
2001-01-01 NaN NaN NaN NaN 0.7286 2 1.451866e+15
2002-01-01 NaN NaN NaN NaN 0.7286 2 1.451866e+15
2000-01-01 0.7284 4000000 0.7285 1000000 NaN NaN NaN
2001-01-01 0.7284 4000000 0.7290 4000000 NaN NaN NaN
2002-01-01 0.7284 5000000 0.7286 4000000 NaN NaN NaN
2003-01-01 0.7285 1000000 0.7286 4000000 NaN NaN NaN
2004-01-01 0.7285 4000000 0.7290 4000000 NaN NaN NaN
Note there is also DataFrame.join, which can join DataFrames based on their indices and handle non-unique indices depending on the how parameter. Rows with duplicate indices are not removed.
In [94]: df1.join(df2)
Out[94]:
price side timestamp bid bid_size offer
2000-01-01 0.7286 2 1451865675631331 0.7284 4000000 0.7285
2000-01-01 0.7286 2 1451865675631400 0.7284 4000000 0.7285
2001-01-01 0.7286 2 1451865675631861 0.7284 4000000 0.7290
2002-01-01 0.7286 2 1451865675631866 0.7284 5000000 0.7286
offer_size
2000-01-01 1000000
2000-01-01 1000000
2001-01-01 4000000
2002-01-01 4000000
In [95]: df1.join(df2, how='outer')
Out[95]:
price side timestamp bid bid_size offer offer_size
2000-01-01 0.7286 2 1.451866e+15 0.7284 4000000 0.7285 1000000
2000-01-01 0.7286 2 1.451866e+15 0.7284 4000000 0.7285 1000000
2001-01-01 0.7286 2 1.451866e+15 0.7284 4000000 0.7290 4000000
2002-01-01 0.7286 2 1.451866e+15 0.7284 5000000 0.7286 4000000
2003-01-01 NaN NaN NaN 0.7285 1000000 0.7286 4000000
2004-01-01 NaN NaN NaN 0.7285 4000000 0.7290 4000000
@untubu - works a treat with join. Thanks
– noidea
Jan 29 '16 at 17:30
You can mitigate this error without having to change your data or remove duplicates. Just create a new index with DataFrame.reset_index:
df = df.reset_index()
The old index is kept as a column in your dataframe, but if you don't need it you can do:
df = df.reset_index(drop=True)
Some prefer:
df.reset_index(inplace=True, drop=True)
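As a minimal sketch (assuming df1 and df2 from the question), resetting both indexes first lets the concat go through, although with axis=1 the rows are then aligned by position rather than by timestamp:
import pandas as pd
# drop=True discards the old index and avoids a clash with df1's existing 'timestamp' column
data = pd.concat([df1.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)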
This should be marked as the right answer, since it solves the problem without losing information. Duplicate records are not always wrong in datasets.
– Juanu Haedo
Nov 24 '17 at 19:29
Another thing that might throw this type of error is when you have a column with a single unique value inside (entropy of 0).
In this case, you can either inject small Gaussian noise into that column or remove it completely.
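For the removal option, a minimal sketch (assuming df is any DataFrame) that drops constant columns:
df = df.loc[:, df.nunique() > 1]  # keep only columns with more than one distinct value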