博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
java+lucene中文分词,搜索引擎搜词剖析
阅读量:7182 次
发布时间:2019-06-29

本文共 21542 字,大约阅读时间需要 71 分钟。

我想只要是学过数据库的孩纸,不管是MySQL,还是SQL Server,一提到查找,本能想到的便是like关键字,其实去转盘网(分类模式)之前也是采用这种算法。但我可以告诉大家一个很不幸的事情:like匹配其实会浪费大量的有用资源,原因这里不说了,请自己想一想,我们还是直接摆事实验证。

现在用去转盘网搜 hello 这个单词,如下:

翻页你会发现只要是包含hello的单词都找到了,但是如果你用like的话是不会有这个效果的,不信让我们再看一下,还好的分词算法我还没来得及修改,还可以看到现象:

你会发现只有以hello开头的搜索串才能得到匹配,这样问题就来了:数据库中大量的资源岂不是白白浪费了?不过没事,伟大的人类还是很聪明的,发明了分词。分词的原理我就不讲了,请自己百度吧,还是直接上代码。提示:这里需要四个jar包作为工具,我先上传到去转盘网,想要做分词的请先下载:

直接看代码:

package com.tray.indexData;import java.io.File;import java.io.IOException;import java.io.StringReader;import java.math.BigInteger;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map; import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.document.Document;import org.apache.lucene.document.Fieldable;import org.apache.lucene.index.CorruptIndexException;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.IndexWriterConfig.OpenMode;import org.apache.lucene.index.Term;import org.apache.lucene.queryParser.MultiFieldQueryParser;import org.apache.lucene.queryParser.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.PrefixQuery;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TermQuery;import org.apache.lucene.search.TopDocs;import org.apache.lucene.search.TopScoreDocCollector;import org.apache.lucene.search.WildcardQuery;import org.apache.lucene.search.highlight.Highlighter;import org.apache.lucene.search.highlight.QueryScorer;import org.apache.lucene.search.highlight.SimpleHTMLFormatter;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;import org.wltea.analyzer.lucene.IKAnalyzer; import com.tray.bean.SerachResult;import com.tray.common.tools.DateFormater; public class LuceneSearch {         private static String DISC_URL = "/home/indexData/data";         static {        String os = System.getProperty("os.name");          if(os.toLowerCase().startsWith("win")){              DISC_URL = "E:\\indexData\\data";         }        else{            DISC_URL ="/home/indexData/data";        }    }             //指定分词器     private Analyzer analyzer=new IKAnalyzer();     private static Directory 
directory;    //配置    private static IndexWriterConfig iwConfig;    //配置IndexWriter    private static IndexWriter writer;      private static File indexFile = null;           private static Version version = Version.LUCENE_36;         private final int PAPGESIZE=10;     /**     * 全量索引     * @Author haoning     */    public void init() throws Exception {                 try {            indexFile = new File(DISC_URL);            if (!indexFile.exists()) {                indexFile.mkdir();            }            directory=FSDirectory.open(indexFile);              //配置IndexWriterConfig              iwConfig = new IndexWriterConfig(version,analyzer);              iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);                  //创建写索引对象              writer = new IndexWriter(directory,iwConfig);           } catch (Exception e) {        }    }         public void closeWriter(){        try {            writer.close();        } catch (CorruptIndexException e) {            e.printStackTrace();        } catch (IOException e) {            e.printStackTrace();        }    }         public void commit(){                 try {            writer.commit();        } catch (CorruptIndexException e) {            e.printStackTrace();        } catch (IOException e) {            e.printStackTrace();        }    }         /**     * 一个一个索引     * @Author haoning     */    public void singleIndex(Document doc) throws Exception {        writer.addDocument(doc);    }         /**     * 一个跟新     * @Author haoning     */    public void singleUpdate(Document doc) throws Exception {        Term term = new Term("url", doc.get("url"));        writer.updateDocument(term,doc);    }         /**     * 全量索引     * @Author haoning     */    public void fullIndex(Document[] documentes) throws Exception {                 writer.deleteAll();        for (Document document : documentes) {            writer.addDocument(document);        }        writer.commit();    }         /**     * 根据id删除索引     * @Author 
haoning     */    public void deleteIndex(Document document)throws Exception{        Term term = new Term("url", document.get("url"));//url才是唯一标志        writer.deleteDocuments(term);        writer.commit();    }         /**     * 根据id增量索引     * @Author haoning     */    public void updateIndex(Document[] documentes) throws Exception{        for (Document document : documentes) {            Term term = new Term("url", document.get("url"));            writer.updateDocument(term, document);        }        writer.commit();    }         /**     * 直接查询     * @Author haoning     */    public void simpleSearch(String filedStr,String queryStr,int page, int pageSize) throws Exception{        File indexDir = new File(DISC_URL);          //索引目录          Directory dir=FSDirectory.open(indexDir);          //根据索引目录创建读索引对象          IndexReader reader = IndexReader.open(dir);          //搜索对象创建          IndexSearcher searcher = new IndexSearcher(reader);        TopScoreDocCollector topCollector = TopScoreDocCollector.create(searcher.maxDoc(), false);                 Term term = new Term(filedStr, queryStr);        Query query = new TermQuery(term);        searcher.search(query, topCollector);        ScoreDoc[] docs = topCollector.topDocs((page-1)*pageSize, pageSize).scoreDocs;                 printScoreDoc(docs, searcher);    }         /**     * 高亮查询     * @Author haoning     */    public Map
highLightSearch(String filed,String keyWord,int curpage, int pageSize) throws Exception{ List
list=new ArrayList
(); Map
map = new HashMap
(); if (curpage <= 0) { curpage = 1; } if (pageSize <= 0 || pageSize>20) { pageSize = PAPGESIZE; } File indexDir = new File(DISC_URL); //索引目录 Directory dir=FSDirectory.open(indexDir);//根据索引目录创建读索引对象 IndexReader reader = IndexReader.open(dir);//搜索对象创建 IndexSearcher searcher = new IndexSearcher(reader); int start = (curpage - 1) * pageSize; Analyzer analyzer = new IKAnalyzer(true); QueryParser queryParser = new QueryParser(Version.LUCENE_36, filed, analyzer); queryParser.setDefaultOperator(QueryParser.AND_OPERATOR); Query query = queryParser.parse(keyWord); int hm = start + pageSize; TopScoreDocCollector res = TopScoreDocCollector.create(hm, false); searcher.search(query, res); SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("
", ""); Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); long amount = res.getTotalHits(); //long pages = (rowCount - 1) / pageSize + 1; //计算总页数 map.put("amount",amount);//总共多少条记录 TopDocs tds = res.topDocs(start, pageSize); ScoreDoc[] sd = tds.scoreDocs; for (int i = 0; i < sd.length; i++) { Document doc = searcher.doc(sd[i].doc); String temp=doc.get("name"); //做高亮处理 TokenStream ts = analyzer.tokenStream("name", new StringReader(temp)); SerachResult record=new SerachResult(); String name = highlighter.getBestFragment(ts,temp); String skydirverName=doc.get("skydirverName"); String username=doc.get("username"); String shareTime=doc.get("shareTime"); String describ=doc.get("describ"); String typeId=doc.get("typeId"); String id=doc.get("id"); String url=doc.get("url"); record.setName(name); record.setSkydriverName(skydirverName); record.setUsername(username); record.setShareTime(DateFormater.getFormatDate(shareTime,"yyyy-MM-dd HH:mm:ss")); record.setDescrib(describ); record.setTypeId(Integer.parseInt(typeId)); record.setId(new BigInteger(id)); record.setUrl(url); list.add(record); /*System.out.println("name:"+name); System.out.println("skydirverName:"+skydirverName); System.out.println("username:"+username); System.out.println("shareTime:"+shareTime); System.out.println("describ:"+describ); System.out.println("typeId:"+typeId); System.out.println("id:"+id); System.out.println("url:"+url);*/ } map.put("source",list); return map; } /** * 根据前缀查询 * @Author haoning */ public void prefixSearch(String filedStr,String queryStr) throws Exception{ File indexDir = new File(DISC_URL); //索引目录 Directory dir=FSDirectory.open(indexDir); //根据索引目录创建读索引对象 IndexReader reader = IndexReader.open(dir); //搜索对象创建 IndexSearcher searcher = new IndexSearcher(reader); Term term = new Term(filedStr, queryStr); Query query = new PrefixQuery(term); ScoreDoc[] docs = searcher.search(query, 3).scoreDocs; printScoreDoc(docs, searcher); } /** * 通配符查询 * @Author 
haoning */ public void wildcardSearch(String filedStr,String queryStr) throws Exception{ File indexDir = new File(DISC_URL); //索引目录 Directory dir=FSDirectory.open(indexDir); //根据索引目录创建读索引对象 IndexReader reader = IndexReader.open(dir); //搜索对象创建 IndexSearcher searcher = new IndexSearcher(reader); Term term = new Term(filedStr, queryStr); Query query = new WildcardQuery(term); ScoreDoc[] docs = searcher.search(query, 3).scoreDocs; printScoreDoc(docs, searcher); } /** * 分词查询 * @Author haoning */ public void analyzerSearch(String filedStr,String queryStr) throws Exception{ File indexDir = new File(DISC_URL); //索引目录 Directory dir=FSDirectory.open(indexDir); //根据索引目录创建读索引对象 IndexReader reader = IndexReader.open(dir); //搜索对象创建 IndexSearcher searcher = new IndexSearcher(reader); QueryParser queryParser = new QueryParser(version, filedStr, analyzer); Query query = queryParser.parse(queryStr); ScoreDoc[] docs = searcher.search(query, 3).scoreDocs; printScoreDoc(docs, searcher); } /** * 多属性分词查询 * @Author haoning */ public void multiAnalyzerSearch(String[] filedStr,String queryStr) throws Exception{ File indexDir = new File(DISC_URL); //索引目录 Directory dir=FSDirectory.open(indexDir); //根据索引目录创建读索引对象 IndexReader reader = IndexReader.open(dir); //搜索对象创建 IndexSearcher searcher = new IndexSearcher(reader); QueryParser queryParser = new MultiFieldQueryParser(version, filedStr, analyzer); Query query = queryParser.parse(queryStr); ScoreDoc[] docs = searcher.search(query, 3).scoreDocs; printScoreDoc(docs, searcher); } public void printScoreDoc(ScoreDoc[] docs,IndexSearcher searcher)throws Exception{ for (int i = 0; i < docs.length; i++) { List
list = searcher.doc(docs[i].doc).getFields(); for (Fieldable fieldable : list) { String fieldName = fieldable.name(); String fieldValue = fieldable.stringValue(); System.out.println(fieldName+" : "+fieldValue); } } }}注意由于去转盘网(http://www.quzhuanpan.com)是部署到linux上的,所以DISC_URL可以更具系统变换,我是通过url来判定索引文件是否唯一的,你可以更具id来判断,具体情况具体对待吧。package com.tray.indexData; import java.sql.SQLException;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import com.mysql.jdbc.Connection;import com.mysql.jdbc.ResultSet;import com.mysql.jdbc.Statement; public class IndexFile { private static Connection conn = null; private static Statement stmt = null; private final int NUM=500000; private LuceneSearch ls; private long count=0; public ResultSet deal6SourceTable(String tableName) throws SQLException{ String sql = "SELECT distinct `NAME`,SKYDRIVER_NAME,USERNAME,SHARE_TIME,DESCRIB,TYPE_ID,ID,URL FROM "+tableName+" where STATUS=1 and TYPE_ID !='-1' and (TYPE_NAME is null or TYPE_NAME!=1) limit "+NUM; //System.out.println(sql); ResultSet rs = (ResultSet) stmt.executeQuery(sql); return rs; } public void update6SourceTable(String tableName) throws SQLException{ Statement st = (Statement) conn.createStatement(); String sql = "update "+tableName+" set TYPE_NAME=1 where STATUS=1 and TYPE_ID !='-1' and (TYPE_NAME is null or TYPE_NAME!=1) limit "+NUM; //System.out.println("update"+sql); try { st.executeUpdate(sql); } catch (SQLException e) { e.printStackTrace(); } } public void indexInit(){//数据库+lcene初始化 conn = (Connection) JdbcUtil.getConnection(); if(conn == null) { try { throw new Exception("数据库连接失败!"); } catch (Exception e) { e.printStackTrace(); } } ls=new LuceneSearch(); try { ls.init(); } catch (Exception e2) { e2.printStackTrace(); } } public void indexEnd(){//数据库+lcene关闭 ls.closeWriter(); try { conn.close();//关闭数据库 } catch (SQLException e) { e.printStackTrace(); } } public void Index6Data() throws SQLException{ try { stmt = (Statement) conn.createStatement(); } 
catch (SQLException e1) { e1.printStackTrace(); } ResultSet r1=null; ResultSet r2=null; ResultSet r3=null; ResultSet r4=null; ResultSet r5=null; ResultSet r6=null; boolean stop=false; do{ r1=deal6SourceTable("film_and_tv_info"); stop=this.createIndex(r1,ls,"1"); //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引 if(!stop){ ls.commit();//加个判断条件 } //System.out.println("stop"+stop); }while(!stop); stop=false; do{ r2=deal6SourceTable("music_and_mv_info"); stop=this.createIndex(r2,ls,"2"); //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引 if(!stop){ ls.commit();//加个判断条件 } }while(!stop); stop=false; do{ r3=deal6SourceTable("e_book_info"); stop=this.createIndex(r3,ls,"3"); //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引 if(!stop){ ls.commit();//加个判断条件 } }while(!stop); stop=false; do{ r4=deal6SourceTable("bt_file_info"); stop=this.createIndex(r4,ls,"4"); //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引 if(!stop){ ls.commit();//加个判断条件 } }while(!stop); stop=false; do{ r5=deal6SourceTable("characteristic_software_info"); stop=this.createIndex(r5,ls,"5"); //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引 if(!stop){ ls.commit();//加个判断条件 } }while(!stop); stop=false; do{ r6=deal6SourceTable("source_code_info"); stop=this.createIndex(r6,ls,"6"); //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引 if(!stop){ ls.commit();//加个判断条件 } }while(!stop); stop=false; } public ResultSet deal2Share(String tableName) throws SQLException{ String sql = "SELECT distinct NAME,SKYDRIVER_NAME,USERNAME,SHARE_TIME,DESCRIB,TYPE_ID,ID,SHORTURL from "+tableName+" where STATUS=1 and FS_ID ='1' limit "+NUM; //利用FS_ID这个字段,没什么用处 ResultSet rs = (ResultSet) stmt.executeQuery(sql); return rs; } public ResultSet deal3Share(String tableName) throws SQLException{ String sql = "SELECT distinct title,channel,uid,ctime,description,port,id,shorturl from "+tableName+" where name ='1' limit "+NUM; ResultSet rs = (ResultSet) stmt.executeQuery(sql); return rs; } public void Index3Data() throws SQLException{ try { stmt = (Statement) 
conn.createStatement(); } catch (SQLException e1) { e1.printStackTrace(); } ResultSet r1=null; ResultSet r2=null; ResultSet r3=null; boolean stop=false; do{ r1=deal2Share("share1"); stop=this.createIndex(r1,ls,"7"); //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引 if(!stop){ ls.commit();//加个判断条件 } //System.out.println("stop"+stop); }while(!stop); stop=false; do{ r2=deal2Share("share2"); stop=this.createIndex(r2,ls,"8"); //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引 if(!stop){ ls.commit();//加个判断条件 } }while(!stop); stop=false; do{ r3=deal3Share("share3"); stop=this.createIndex(r3,ls,"9"); //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引 if(!stop){ ls.commit();//加个判断条件 } }while(!stop); stop=false; } public void update2ShareTable(String tableName) throws SQLException{ Statement st = (Statement) conn.createStatement(); String sql = "update "+tableName+" set FS_ID=0 where STATUS=1 and FS_ID ='1' limit "+NUM; //利用FS_ID这个字段,没什么用处 //System.out.println("update"+sql); try { st.executeUpdate(sql); } catch (SQLException e) { e.printStackTrace(); } } public void update3ShareTable(String tableName) throws SQLException{ Statement st = (Statement) conn.createStatement(); String sql = "update "+tableName+" set name=0 where name ='1' limit "+NUM; //System.out.println("update"+sql); try { st.executeUpdate(sql); } catch (SQLException e) { e.printStackTrace(); } } public boolean createIndex(ResultSet rs,LuceneSearch ls,String mark) { try { String tableName=null; if(mark.equals("1")){ tableName="film_and_tv_info"; } if(mark.equals("2")){ tableName="music_and_mv_info"; } if(mark.equals("3")){ tableName="e_book_info"; } if(mark.equals("4")){ tableName="bt_file_info"; } if(mark.equals("5")){ tableName="characteristic_software_info"; } if(mark.equals("6")){ tableName="source_code_info"; } if(mark.equals("7")){ tableName="share1"; } if(mark.equals("8")){ tableName="share2"; } if(mark.equals("9")){ tableName="share3"; } boolean isNull=rs.next(); //System.out.println("hehe"+isNull); 
if(isNull==false){ return true;//处理完毕 } while(isNull){ if(Integer.parseInt(mark)>=1&&Integer.parseInt(mark)<=8){ Document doc = new Document(); //System.out.println("name"+rs.getString("NAME")); Field name = new Field("name",rs.getString("NAME"),Field.Store.YES,Field.Index.ANALYZED); String skName=rs.getString("SKYDRIVER_NAME"); if(skName==null){ skName="百度"; } Field skydirverName = new Field("skydirverName",skName, Field.Store.YES,Field.Index.NOT_ANALYZED); Field username = new Field("username",rs.getString("USERNAME"),Field.Store.YES, Field.Index.ANALYZED); Field shareTime = new Field("shareTime",rs.getString("SHARE_TIME"), Field.Store.YES,Field.Index.NOT_ANALYZED); String desb=rs.getString("DESCRIB"); if(desb==null){ desb="-1"; } Field describ = new Field("describ",desb,Field.Store.NO,Field.Index.NOT_ANALYZED); Field typeId = new Field("typeId",rs.getString("TYPE_ID"), Field.Store.YES,Field.Index.NOT_ANALYZED); Field id = new Field("id",rs.getString("ID"),Field.Store.YES,Field.Index.NOT_ANALYZED); Field url =null; if(Integer.parseInt(mark)>=7&&Integer.parseInt(mark)<=8){ url = new Field("url",rs.getString("SHORTURL"), Field.Store.YES,Field.Index.ANALYZED); } else{ url = new Field("url",rs.getString("URL"), Field.Store.YES,Field.Index.ANALYZED); } doc.add(name); doc.add(skydirverName); doc.add(username); doc.add(shareTime); doc.add(describ); doc.add(typeId); doc.add(id); doc.add(url); ls.singleUpdate(doc);//用跟新更为合适 isNull=rs.next(); } else{ Document doc = new Document(); //System.out.println("title"+rs.getString("title")); Field name = new Field("name",rs.getString("title"),Field.Store.YES,Field.Index.ANALYZED); String skName=rs.getString("channel"); Field skydirverName = new Field("skydirverName",skName, Field.Store.YES,Field.Index.NOT_ANALYZED); Field username = new Field("username",rs.getString("uid"),Field.Store.YES, Field.Index.ANALYZED); Field shareTime = new Field("shareTime",rs.getString("ctime"), Field.Store.YES,Field.Index.NOT_ANALYZED); String 
desb=rs.getString("description"); if(desb==null){ desb="-1"; } Field describ = new Field("describ",desb,Field.Store.NO,Field.Index.NOT_ANALYZED); Field typeId = new Field("typeId",rs.getString("port"), Field.Store.YES,Field.Index.NOT_ANALYZED); Field id = new Field("id",rs.getString("id"),Field.Store.YES,Field.Index.NOT_ANALYZED); Field url = new Field("url",rs.getString("shorturl"), Field.Store.YES,Field.Index.ANALYZED); doc.add(name); doc.add(skydirverName); doc.add(username); doc.add(shareTime); doc.add(describ); doc.add(typeId); doc.add(id); doc.add(url); ls.singleUpdate(doc);//用跟新更为合适 isNull=rs.next(); } count=count+1; } if(Integer.parseInt(mark)>=1&&Integer.parseInt(mark)<=6){ update6SourceTable(tableName);//处理完成后做标志 } else if(Integer.parseInt(mark)>=7&&Integer.parseInt(mark)<=8){ update2ShareTable(tableName);//处理完成后做标志 } else{ update3ShareTable(tableName);//处理完成后做标志 } System.out.println("Has index "+count+"条数据,数据来自表"+tableName); } catch (Exception e) { e.printStackTrace(); } return false; }}数据库之类的请不要关心,看思路即可,你如果需要换成你的即可,这里就不多说了。看最后的部分:package com.tray.indexData; import java.sql.SQLException; public class Application { public static void main(String[] args){ /*IndexFile indexFile=new IndexFile(); indexFile.indexInit(); try { indexFile.Index6Data(); } catch (SQLException e1) { e1.printStackTrace(); } indexFile.indexEnd();*/ IndexFile indexFile1=new IndexFile(); indexFile1.indexInit(); try { indexFile1.Index3Data(); } catch (SQLException e1) { e1.printStackTrace(); } indexFile1.indexEnd(); LuceneSearch lch=new LuceneSearch(); try { long a = System.currentTimeMillis(); lch.highLightSearch("name", "flv", 1,3); long b = System.currentTimeMillis(); long c = b - a; System.out.println("[高级检索花费时间:" + c + "毫秒]"); } catch (Exception e) { e.printStackTrace(); } }}

你可以在一个application程序中开始索引,也可以写个定时器来定时索引,看需求。以上代码是楼主辛苦的作品,转载请不要改动,本人确保代码完全可用。本人建个qq群,欢迎大家一起交流技术, 群号:512245829 喜欢微博的朋友关注:转盘娱乐即可

你可能感兴趣的文章
fzyzojP3412 -- [校内训练20171212]奇数
查看>>
iphone-common-codes-ccteam源代码 CCUIScreen.h
查看>>
iframe通信
查看>>
spark安装部署
查看>>
记录毕业论文 LanguageTool 二次开发时用到的网站
查看>>
MySQL 第二篇
查看>>
前端优化——懒加载篇
查看>>
function/bind的救赎(上)
查看>>
[转] Android 远程图片获取和本地缓存
查看>>
module.exports 和 exports的区别
查看>>
备忘 - Mac下彻底删除MySQL方法
查看>>
codeforces --- Round #250 (Div. 2) A. The Child and Homework
查看>>
System.Web.NullPointerException
查看>>
ISI CVPR journal ranking
查看>>
面试过程中要点总结
查看>>
CSS3实现倒计时
查看>>
第一篇markdown博文
查看>>
Android源码学习之组合模式应用
查看>>
List泛型类的方法及使用
查看>>
HTML特殊字符编码对照表
查看>>