抽取trigram特征代码如下:
/** Extracts character trigram features from `query`.
  *
  * The query is split into single-character strings, and every window of three
  * consecutive characters is emitted as a feature with a count of 1.
  *
  * @param query text to extract trigrams from
  * @return a lazy iterator of `((first, second, third), 1)` pairs; it is empty when
  *         `query` yields fewer than three characters
  */
def trigramsFeature(query: String): Iterator[((String, String, String), Int)] = {
  // `collect` is used instead of `filter`/`map` with a bare `case Array(a, b, c)`:
  // for inputs with fewer than three elements `sliding(3)` yields one short window,
  // which a non-exhaustive pattern would turn into a MatchError.
  // The length guard drops windows containing an empty element (e.g. the leading
  // empty string that `split("")` returned before Java 8).
  query.split("")
    .sliding(3)
    .collect { case Array(a, b, c) if (a + b + c).length == 3 => ((a, b, c), 1) }
}
// The code below extracts bigram features and hashes them:
/** Extracts hashed character bigram features from `query`.
  *
  * The query is cut into sentences at each "。"; within every sentence each pair of
  * adjacent non-empty characters is concatenated, and the pair's `hashCode` is
  * emitted with a weight of 1.0. Bigrams never span a sentence boundary.
  *
  * @param query text to extract bigrams from
  * @return the `(hash, 1.0)` pairs of all sentences, in order of appearance
  */
def bigramsFeature(query: String) = {
  val sentences = query.split("。")
  sentences.flatMap { sentence =>
    val chars = sentence.split("").filter(_.nonEmpty)
    // A sentence with fewer than two characters yields no complete window;
    // `collect` simply skips anything that is not a two-element window.
    chars.sliding(2).collect {
      case Array(first, second) => ((first + second).hashCode, 1.0)
    }
  }
}