我写了一个简单的函数，用于把 Google News 预训练模型加载为 Spark 的 Word2VecModel。请享用。
/**
 * Loads a gzip-compressed word2vec binary model into a `Word2VecModel`.
 *
 * The stream is read as: an ASCII header line `"<records> <dimensions>\n"`,
 * followed by `records` entries, each consisting of a word terminated by a
 * single space and then `dimensions` little-endian 32-bit floats.
 *
 * @param file path to the gzip-compressed binary model file
 * @return a `Word2VecModel` built from the word -> vector map read from the file
 * @throws IllegalArgumentException if the header line is not two space-separated fields
 * @throws NumberFormatException if a header field is not an integer
 * @throws java.io.IOException (including `EOFException`) if the file is truncated or unreadable
 */
def loadBin(file: String): Word2VecModel = {
  import java.io.{BufferedInputStream, ByteArrayOutputStream}
  import java.nio.charset.StandardCharsets
  import scala.util.control.NonFatal

  /**
   * Reads bytes up to (and consuming, but not returning) the terminator `term`
   * and decodes them as UTF-8.
   *
   * Bytes are accumulated raw and decoded once at the end: converting each
   * signed byte to a `Char` individually would sign-extend values >= 0x80 and
   * corrupt every non-ASCII word.
   *
   * @param inputStream stream positioned at the start of the token
   * @param term        single-byte terminator character
   * @param maxLength   upper bound (in bytes) on the token length; exceeding it trips an assertion
   */
  def readUntil(inputStream: DataInputStream, term: Char, maxLength: Int = 1024 * 8): String = {
    val terminator = term.toByte
    val bytes = new ByteArrayOutputStream()
    var current = inputStream.readByte()
    while (current != terminator) {
      bytes.write(current.toInt)
      assert(bytes.size() < maxLength, s"token exceeds $maxLength bytes before terminator")
      current = inputStream.readByte()
    }
    new String(bytes.toByteArray, StandardCharsets.UTF_8)
  }

  // Open the file first so it can be closed if wrapping it fails (e.g. the
  // file is not valid gzip and the GZIPInputStream constructor throws).
  val fileStream = new FileInputStream(file)
  val inputStream: DataInputStream =
    try {
      // Buffer the decompressed side: the parser below reads one byte / one int at a time.
      new DataInputStream(new BufferedInputStream(new GZIPInputStream(fileStream)))
    } catch {
      case NonFatal(e) =>
        fileStream.close()
        throw e
    }

  try {
    val header = readUntil(inputStream, '\n')
    val (records, dimensions) = header.trim.split(" ") match {
      case Array(recordCount, dimensionCount) => (recordCount.toInt, dimensionCount.toInt)
      case _ =>
        throw new IllegalArgumentException(
          s"Malformed word2vec header, expected '<records> <dimensions>': '$header'")
    }

    // Entries are read strictly in file order; `toMap` forces the iterator
    // before the stream is closed in `finally`.
    val vectors = Iterator.fill(records) {
      // Some writers emit '\n' after each vector, which would otherwise end up
      // as a prefix of the next word.
      val word = readUntil(inputStream, ' ').dropWhile(_ == '\n')
      val vector = Array.fill(dimensions) {
        // Floats are stored little-endian; DataInputStream reads big-endian.
        java.lang.Float.intBitsToFloat(java.lang.Integer.reverseBytes(inputStream.readInt()))
      }
      word -> vector
    }.toMap

    new Word2VecModel(vectors)
  } finally {
    inputStream.close()
  }
}
将bin转换为文本文件后应该如何加载模型? – LonsomeHell