'''
Sparse Matrix
'''
import
struct
import
numpy as np
import
bsddb
from
cStringIO
import
StringIO
class DictMatrix():
    """Sparse matrix backed by a dict-like byte store.

    Each cell (i, j) is recorded twice so both row and column slices are
    cheap: a record packed as (j, value) is appended to the row entry
    keyed ``'i<i>'`` and a record packed as (i, value) is appended to the
    column entry keyed ``'j<j>'``.  Records are fixed 8-byte ``struct``
    'if' pairs, decoded in bulk with numpy's ``'i4,f4'`` dtype.

    The backing ``container`` only needs dict-style item get/set that
    raises KeyError on missing keys — e.g. a plain dict or a bsddb btree.
    """

    def __init__(self, container=None, dft=0.0):
        """Create a matrix over *container* (a new dict if None).

        dft is the value returned for absent cells/rows/columns.
        """
        # A fresh dict per instance — the original `container={}` default
        # was a shared mutable default, silently aliasing all instances.
        self._data = {} if container is None else container
        self._dft = dft
        self._nums = 0  # running count of cells written

    def __setitem__(self, index, value):
        """Append cell (i, j) = value; *index* must unpack to two items."""
        try:
            i, j = index
        except (TypeError, ValueError):
            raise IndexError('invalid index')
        # To save memory, pack (index, value) into an 8-byte binary record
        # instead of keeping per-cell Python objects.
        ik = 'i%d' % i
        ib = struct.pack('if', int(j), value)
        jk = 'j%d' % j
        jb = struct.pack('if', int(i), value)
        # EAFP append-or-create; only KeyError means "new key" — the
        # original bare except hid every other failure.
        try:
            self._data[ik] += ib
        except KeyError:
            self._data[ik] = ib
        try:
            self._data[jk] += jb
        except KeyError:
            self._data[jk] = jb
        self._nums += 1

    def __getitem__(self, index):
        """Read a cell, row or column.

        m[i, j] -> cell value (dft if absent);
        m[i, x] with non-int x (e.g. a slice) -> {j: value} row dict;
        m[x, j] with non-int x -> {i: value} column dict.

        Raises IndexError when *index* does not unpack to two items or
        when neither component is an int (the original crashed with an
        unbound-local NameError in that case).
        """
        try:
            i, j = index
        except (TypeError, ValueError):
            raise IndexError('invalid index')
        if isinstance(i, int):
            # EAFP lookup works on dicts and dbm-style stores alike
            # (the original used the py2-only dict.has_key).
            try:
                raw = self._data['i%d' % i]
            except KeyError:
                return self._dft
            # np.frombuffer replaces the deprecated np.fromstring.
            row = dict(np.frombuffer(raw, dtype='i4,f4'))
            if isinstance(j, int):
                return row.get(j, self._dft)
            return row
        if isinstance(j, int):
            try:
                raw = self._data['j%d' % j]
            except KeyError:
                return self._dft
            return dict(np.frombuffer(raw, dtype='i4,f4'))
        raise IndexError('invalid index')

    def __len__(self):
        """Number of cells written so far (not the matrix dimensions)."""
        return self._nums

    def __iter__(self):
        # Placeholder — original body was `pass`, which made iter(m) fail
        # with an obscure TypeError; fail loudly and explicitly instead.
        raise NotImplementedError('DictMatrix iteration is not implemented')

    def from_file(self, fp, sep='\t'):
        """Bulk-load ``i<sep>j<sep>value`` lines from an iterable of text lines.

        dbm-style stores write much more slowly than memory, so records
        are accumulated in in-memory buffers and flushed to the backing
        store every 10 million lines; buffers are used because repeated
        byte-string concatenation would be quadratic.

        Returns the running total of cells written (self._nums).
        NOTE: the default separator was the literal character 't' in the
        original — an evident typo for tab, fixed here.
        """
        # Binary buffer type, bridged for Python 2 (cStringIO) / 3 (BytesIO).
        try:
            from cStringIO import StringIO as _Buffer
        except ImportError:
            from io import BytesIO as _Buffer
        cnt = 0
        cache = {}
        for line in fp:
            if cnt == 10000000:
                self._flush(cache)
                cnt = 0
                cache = {}
            i, j, v = [float(x) for x in line.split(sep)]
            # struct's 'i' slot needs real ints; int() truncates exactly
            # like the '%d' formatting used for the keys.
            i, j = int(i), int(j)
            ib = struct.pack('if', j, v)
            jb = struct.pack('if', i, v)
            for key, record in (('i%d' % i, ib), ('j%d' % j, jb)):
                try:
                    cache[key].write(record)
                except KeyError:
                    buf = _Buffer()
                    buf.write(record)
                    cache[key] = buf
            cnt += 1
            self._nums += 1
        self._flush(cache)
        return self._nums

    def _flush(self, cache):
        """Append each buffered record batch to the backing store."""
        for key, buf in cache.items():
            s = buf.getvalue()
            try:
                self._data[key] += s
            except KeyError:
                self._data[key] = s
if __name__ == '__main__':
    # Anonymous (in-memory/temp) btree store with a 256 MB cache; passing
    # None as the filename keeps it off a fixed path.
    db = bsddb.btopen(None, cachesize=268435456)
    data = DictMatrix(db)
    # `with` ensures the log file is closed even on error — the original
    # leaked the file handle.
    # NOTE(review): sep=',' here while from_file defaults to tab — confirm
    # the actual log format.
    with open('/path/to/log.txt', 'r') as fp:
        data.from_file(fp, ',')