2012-08-14 32 views
6

我试图从this dump创建一个LinkedGeoData.org的本地镜像。加载rdf三元组到开放源代码

大约有61,000,000个三元组。 Virtuoso应该可以轻松处理更多的事情,但每过一次它就会停止加载大约40,000,000个三元组。我使用Amazon EC2中的一个超大型实例,它具有30 GB的RAM,并且还有大量存储空间。我的配置文件有问题吗?我使用的是Ubuntu 12.04服务器,我试过通过apt-get(版本6.1.5)安装Virtuoso,然后在Jörn Hees' instructions之后从github(版本6.1.6)的最新稳定版本中编译。

我也尝试将转储文件拆分成小块并逐一加载它们。插入大约40,000,000个三元组后,这也会崩溃。

该日志文件不显示任何奇怪的; virtuoso-t只是停止工作而没有实际崩溃,top显示使用0%CPU的进程。在上半个小时左右之后,我已经让这个过程持续了好几天,没有任何进展。

这里是我的virtuoso.ini文件:

[Database] 
DatabaseFile   = /var/lib/virtuoso/db/virtuoso.db 
ErrorLogFile   = /var/lib/virtuoso/db/virtuoso.log 
LockFile   = /var/lib/virtuoso/db/virtuoso.lck 
TransactionFile   = /var/lib/virtuoso/db/virtuoso.trx 
xa_persistent_file  = /var/lib/virtuoso/db/virtuoso.pxa 
ErrorLogLevel   = 7 
FileExtend   = 200 
MaxCheckpointRemap  = 625000 
Striping   = 0 
TempStorage   = TempDatabase 


[TempDatabase] 
DatabaseFile   = /var/lib/virtuoso/db/virtuoso-temp.db 
TransactionFile   = /var/lib/virtuoso/db/virtuoso-temp.trx 
MaxCheckpointRemap  = 2000 
Striping   = 0 


; 
; Server parameters 
; 
[Parameters] 
ServerPort   = 1111 
LiteMode   = 0 
DisableUnixSocket  = 1 
DisableTcpSocket  = 0 
;SSLServerPort   = 2111 
;SSLCertificate   = cert.pem 
;SSLPrivateKey   = pk.pem 
;X509ClientVerify  = 0 
;X509ClientVerifyDepth  = 0 
;X509ClientVerifyCAFile  = ca.pem 
ServerThreads   = 20 
CheckpointInterval  = 60 
O_DIRECT   = 0 
CaseMode   = 2 
MaxStaticCursorRows  = 5000 
CheckpointAuditTrail  = 0 
AllowOSCalls   = 0 
SchedulerInterval  = 10 
DirsAllowed   = ., /usr/share/virtuoso/vad, /home/ubuntu/lgd 
ThreadCleanupInterval  = 0 
ThreadThreshold   = 10 
ResourcesCleanupInterval = 0 
FreeTextBatchSize  = 100000 
SingleCPU   = 0 
VADInstallDir   = /usr/share/virtuoso/vad/ 
PrefixResultNames    = 0 
RdfFreeTextRulesSize  = 100 
IndexTreeMaps   = 256 
MaxMemPoolSize     = 200000000 
PrefixResultNames    = 0 
MacSpotlight     = 0 
IndexTreeMaps     = 64 
;; 
;; When running with large data sets, one should configure the Virtuoso 
;; process to use between 2/3 to 3/5 of free system memory and to stripe 
;; storage on all available disks. 
;; 
;; Uncomment next two lines if there is 2 GB system memory free 
;  NumberOfBuffers   = 170000 
;  MaxDirtyBuffers   = 130000 
;; Uncomment next two lines if there is 4 GB system memory free 
;  NumberOfBuffers   = 340000 
;  MaxDirtyBuffers   = 250000 
;; Uncomment next two lines if there is 8 GB system memory free 
;  NumberOfBuffers   = 680000 
;  MaxDirtyBuffers   = 500000 
;; Uncomment next two lines if there is 16 GB system memory free 
;  NumberOfBuffers   = 1360000 
;  MaxDirtyBuffers   = 1000000 
;; Uncomment next two lines if there is 32 GB system memory free 
     NumberOfBuffers   = 2720000 
     MaxDirtyBuffers   = 2000000 
;; Uncomment next two lines if there is 48 GB system memory free 
;  NumberOfBuffers   = 4000000 
;  MaxDirtyBuffers   = 3000000 
;; Uncomment next two lines if there is 64 GB system memory free 
;  NumberOfBuffers   = 5450000 
;  MaxDirtyBuffers   = 4000000 
;; 
;; Note the default settings will take very little memory 
;; but will not result in very good performance 
;; 


[HTTPServer] 
ServerPort   = 8890 
ServerRoot   = /var/lib/virtuoso/vsp 
ServerThreads   = 20 
DavRoot    = DAV 
EnabledDavVSP   = 0 
HTTPProxyEnabled  = 0 
TempASPXDir   = 0 
DefaultMailServer  = localhost:25 
ServerThreads   = 10 
MaxKeepAlives   = 10 
KeepAliveTimeout  = 10 
MaxCachedProxyConnections = 10 
ProxyConnectionCacheTimeout = 15 
HTTPThreadSize   = 280000 
HttpPrintWarningsInOutput = 0 
Charset    = UTF-8 
;HTTPLogFile    = logs/http.log 

[AutoRepair] 
BadParentLinks   = 0 

[Client] 
SQL_PREFETCH_ROWS  = 100 
SQL_PREFETCH_BYTES  = 16000 
SQL_QUERY_TIMEOUT  = 0 
SQL_TXN_TIMEOUT   = 0 
;SQL_NO_CHAR_C_ESCAPE  = 1 
;SQL_UTF8_EXECS   = 0 
;SQL_NO_SYSTEM_TABLES  = 0 
;SQL_BINARY_TIMESTAMP  = 1 
;SQL_ENCRYPTION_ON_PASSWORD = -1 

[VDB] 
ArrayOptimization  = 0 
NumArrayParameters  = 10 
VDBDisconnectTimeout  = 1000 
KeepConnectionOnFixedThread = 0 

[Replication] 
ServerName   = db-IP-10-252-61-61 
ServerEnable   = 1 
QueueMax   = 50000 


; 
; Striping setup 
; 
; These parameters have only effect when Striping is set to 1 in the 
; [Database] section, in which case the DatabaseFile parameter is ignored. 
; 
; With striping, the database is spawned across multiple segments 
; where each segment can have multiple stripes. 
; 
; Format of the lines below: 
; Segment<number> = <size>, <stripe file name> [, <stripe file name> .. ] 
; 
; <number> must be ordered from 1 up. 
; 
; The <size> is the total size of the segment which is equally divided 
; across all stripes forming the segment. Its specification can be in 
; gigabytes (g), megabytes (m), kilobytes (k) or in database blocks 
; (b, the default) 
; 
; Note that the segment size must be a multiple of the database page size 
; which is currently 8k. Also, the segment size must be divisible by the 
; number of stripe files forming the segment. 
; 
; The example below creates a 200 meg database striped on two segments 
; with two stripes of 50 meg and one of 100 meg. 
; 
; You can always add more segments to the configuration, but once 
; added, do not change the setup. 
; 
[Striping] 
Segment1   = 100M, db-seg1-1.db, db-seg1-2.db 
Segment2   = 100M, db-seg2-1.db 
;... 

;[TempStriping] 
;Segment1   = 100M, db-seg1-1.db, db-seg1-2.db 
;Segment2   = 100M, db-seg2-1.db 
;... 

;[Ucms] 
;UcmPath   = <path> 
;Ucm1    = <file> 
;Ucm2    = <file> 
;... 


[Zero Config] 
ServerName   = virtuoso (IP-10-252-61-61) 
;ServerDSN   = ZDSN 
;SSLServerName   = 
;SSLServerDSN   = 


[Mono] 
;MONO_TRACE   = Off 
;MONO_PATH   = <path_here> 
;MONO_ROOT   = <path_here> 
;MONO_CFG_DIR   = <path_here> 
;virtclr.dll   = 


[URIQA] 
DynamicLocal   = 0 
DefaultHost   = localhost:8890 


[SPARQL] 
;ExternalQuerySource  = 1 
;ExternalXsltSource   = 1 
;DefaultGraph   = http://localhost:8890/dataspace 
;ImmutableGraphs   = http://localhost:8890/dataspace 
ResultSetMaxRows   = 10000 
MaxQueryCostEstimationTime = 4000 ; in seconds 
MaxQueryExecutionTime  = 600 ; in seconds 
DefaultQuery    = select distinct ?Concept where {[] a ?Concept} LIMIT 100 
DeferInferenceRulesInit  = 0 ; controls inference rules loading 
;PingService   = http://rpc.pingthesemanticweb.com/ 
ShortenLongURIs = 1 

[Plugins] 
LoadPath   = /usr/lib/virtuoso/hosting 
Load1    = plain, wikiv 
Load2    = plain, mediawiki 
Load3    = plain, creolewiki 
Load4   = plain, im 

任何帮助是极大的赞赏。

+0

为了未来读者的利益...Jörn几次更新了他的指南。 [最新日期为2015-11-23,基于Virtuoso 7.2.1和DBpedia 2015](https://joernhees.de/blog/2015/11/23/setting-up-a-linked-data-mirror -from-RDF-转储-DBpedia中-2015-04-游离碱-维基数据-linkedgeodata与 - virtuso-7-2-1和 - 搬运工-可选/)。 – TallTed 2015-12-23 01:56:58

+0

另请注意,特定于Virtuoso的问题经常通过特定于产品的资源(如[Virtuoso用户邮件列表])(https://lists.sourceforge.net/lists/listinfo/virtuoso-users/),公众[OpenLink支持论坛](http://boards.openlinksw.com/support/index.php)或[机密OpenLink支持案例](http://support.openlinksw.com/support/online-support.vsp) 。 ObDisclaimer:我为[OpenLink Software](http://www.openlinksw.com/)工作,[Virtuoso](http://virtuoso.openlinksw.com/)的制作人。 – TallTed 2015-12-23 01:59:10

回答

4

回答我自己的问题。问题是前导空格中的台词

NumberOfBuffers   = 2720000 
    MaxDirtyBuffers   = 2000000 

删除那些炫技实际使用的可用内存而不是默认的16MB。