打造属于自己的搜索引擎(二)

在前面的文章中已经把爬取的数据存入到es,接下来就是编写前端页面以及es的分词规则和索引的步骤了(额,我是先创建django项目的)

创建django项目

先给出我的项目目录结构吧

一、写es分词和索引规则

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Query the ES index "segmentfault": an analyzed multi-field match OR-ed with
# a fuzzy title match, paginated 10 hits per page, with keyword highlighting.
# NOTE(review): free names `client`, `key_words` and `page` are defined in the
# surrounding view -- this snippet is an excerpt.
response = client.search( 
index = "segmentfault",
body = {
"query":{
"bool":{
"should": [
{"multi_match": {
"query": key_words,# the user's query string; ES analyzes (tokenizes) it before matching
"fields": ["title", "content"]# fields the analyzed match runs against
}},
{"fuzzy": {
"title": {
"value": key_words,
"fuzziness": 2  # tolerate up to 2 character edits (typo tolerance)
}
}}
]
}
},
"from":(page-1)*10,# pagination offset: from/size paging, 10 hits per page
"size":10,
"highlight":{
"pre_tags":["<span class='keyWord'>"], # HTML tag wrapped around each matched keyword
"post_tags":["</span>"],
"fields":{
"title":{},
"content":{},
}
}
}
)

对数据进行处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Shape each ES hit into a plain dict for the result template.
# Prefers the highlighted (tag-wrapped) title/content when ES produced one,
# falling back to the raw _source field; content is capped at 500 characters.
# NOTE(review): `response` and `hit_list` are defined in the surrounding view.
for hit in response["hits"]["hits"]:
    hit_dict = {}
    # BUG FIX: a hit may carry no "highlight" key at all (e.g. only the fuzzy
    # clause matched), so guard with .get() instead of indexing directly.
    highlight = hit.get("highlight", {})
    if "title" in highlight:
        hit_dict["title"] = "".join(highlight["title"])  # highlighted title fragments
    else:
        hit_dict["title"] = hit["_source"]["title"]
    if "content" in highlight:
        hit_dict["content"] = "".join(highlight["content"])[:500]  # first 500 chars
    else:
        hit_dict["content"] = hit["_source"]["content"][:500]

    hit_dict["time"] = hit["_source"]["time"]    # publish time
    hit_dict["url"] = hit["_source"]["url"]      # link to the original article
    hit_dict["score"] = hit["_score"]            # ES relevance score

    hit_list.append(hit_dict)

二、编写前端页面并进行数据的接入工作

这里主要是对urls和views这两个文件进行操作

urls

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# URL routing for the search project (Django 1.x style `url()` patterns).
from django.conf.urls import url
from django.contrib import admin
from django.views.generic import TemplateView  # NOTE(review): unused here -- confirm before removing
from search.views import SearchSuggest, SearchView,IndexView



urlpatterns = [
url(r'^admin/', admin.site.urls),
url(r'^$', IndexView.as_view(), name='index'),  # home / search page
url(r'^suggest/$', SearchSuggest.as_view(), name="suggest"),  # search-as-you-type suggestions
url(r'^search/$', SearchView.as_view(), name="search"),  # results page

]

views.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import json
from django.shortcuts import render
from django.views.generic.base import View
from search.models import ArticleType
from django.http import HttpResponse
from elasticsearch import Elasticsearch
from datetime import datetime
import redis

# Module-level clients shared by all views: a local ES node and a
# default-configured (localhost:6379, db 0) Redis connection.
client = Elasticsearch(hosts=["127.0.0.1"])

redis_cli = redis.StrictRedis()
class IndexView(View):
    """Landing page: shows the five hottest search terms from Redis."""

    def get(self, request):
        # Top-5 members of the sorted set, highest score (most searched) first.
        hot_terms = redis_cli.zrevrangebyscore(
            "search_keywords_set", "+inf", "-inf", start=0, num=5)
        return render(request, "index.html", {"topn_search": hot_terms})

# Create your views here.
class SearchSuggest(View):
    """Autocomplete endpoint: returns up to 10 title suggestions as JSON."""

    def get(self, request):
        key_words = request.GET.get('s', '')
        re_datas = []
        if key_words:
            # Completion suggester over the `suggest` field, tolerating up to
            # 2 character edits in the typed prefix.
            search = ArticleType.search().suggest(
                'my_suggest',
                key_words,
                completion={
                    "field": "suggest",
                    "fuzzy": {
                        "fuzziness": 2
                    },
                    "size": 10,
                },
            )
            suggestions = search.execute_suggest()
            for match in suggestions.my_suggest[0].options:
                # Collect each suggested document's title.
                re_datas.append(match._source["title"])
        return HttpResponse(json.dumps(re_datas), content_type="application/json")





class SearchView(View):
    """Search results page.

    Reads the query string `q` and page number `p`, records the term in the
    Redis hot-search sorted set, queries ES with an analyzed multi-field
    match OR-ed with a fuzzy title match, and renders result.html with the
    highlighted hits plus paging / timing / hot-search metadata.
    """

    def get(self, request):
        key_words = request.GET.get("q", "")
        # Hot-search bookkeeping: bump this term's score, then fetch the
        # current top five. (Recent redis-py versions use the
        # (name, amount, value) argument order for zincrby.)
        redis_cli.zincrby("search_keywords_set", 1, key_words)
        topn_search = redis_cli.zrevrangebyscore(
            "search_keywords_set", "+inf", "-inf", start=0, num=5)

        page = request.GET.get("p", "1")  # requested page, 1-based
        try:
            page = int(page)
        except (TypeError, ValueError):
            # Narrow except (was a bare `except:`): only malformed page
            # values fall back to page 1.
            page = 1

        pm_count = redis_cli.get("pm_count")  # crawled-article count
        start_time = datetime.now()  # wall-clock start of the ES query
        response = client.search(
            index="segmentfault",
            body={
                "query": {
                    "bool": {
                        "should": [
                            # Analyzed match over both indexed text fields.
                            {"multi_match": {
                                "query": key_words,
                                "fields": ["title", "content"]
                            }},
                            # Typo tolerance: titles within 2 edits also match.
                            {"fuzzy": {
                                "title": {
                                    "value": key_words,
                                    "fuzziness": 2
                                }
                            }}
                        ]
                    }
                },
                "from": (page - 1) * 10,  # pagination offset, 10 hits/page
                "size": 10,
                "highlight": {
                    # HTML tags wrapped around each matched keyword.
                    "pre_tags": ["<span class='keyWord'>"],
                    "post_tags": ["</span>"],
                    "fields": {
                        "title": {},
                        "content": {},
                    }
                }
            }
        )
        end_time = datetime.now()
        last_seconds = (end_time - start_time)  # query duration (timedelta)
        # NOTE(review): hits.total is an int on ES <7 but a dict on ES 7+;
        # this assumes the pre-7 integer form -- confirm the server version.
        total_nums = response["hits"]["total"]
        # BUG FIX: the page count must be derived from the total hit count;
        # the original tested `page % 10`, which miscounts pages.
        if (total_nums % 10) > 0:
            page_nums = int(total_nums / 10) + 1
        else:
            page_nums = int(total_nums / 10)

        hit_list = []
        for hit in response["hits"]["hits"]:
            hit_dict = {}
            # BUG FIX: a hit may have no "highlight" key at all (e.g. only
            # the fuzzy clause matched), so guard with .get().
            highlight = hit.get("highlight", {})
            if "title" in highlight:
                hit_dict["title"] = "".join(highlight["title"])
            else:
                hit_dict["title"] = hit["_source"]["title"]
            if "content" in highlight:
                # Cap the snippet at 500 characters.
                hit_dict["content"] = "".join(highlight["content"])[:500]
            else:
                hit_dict["content"] = hit["_source"]["content"][:500]

            hit_dict["time"] = hit["_source"]["time"]  # publish time
            hit_dict["url"] = hit["_source"]["url"]    # link to the article
            hit_dict["score"] = hit["_score"]          # ES relevance score

            hit_list.append(hit_dict)

        # Hand everything to the result template.
        return render(request, "result.html", {
            "page": page,
            "all_hits": hit_list,
            "key_words": key_words,
            "total_nums": total_nums,
            "page_nums": page_nums,
            "last_seconds": last_seconds,
            "pm_count": pm_count,
            "topn_search": topn_search,
        })

对了,在search的目录下也要创建一个models.py,把前面es_type的代码拷过去就行了(作用跟之前的一样)

前端的页面如下:

搜索页面

结果页面

额,前端的源码就不在这展示了,感觉好费事(就是懒…)到时候我会上传GitHub,这里主要是把步骤和关键的部分提一下

通过传入的数据用js实现我的搜索

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40

// Client-side search-history handling: persists past queries in localStorage
// and renders up to five of them under the search box.
// NOTE(review): `search_url` and `KillRepeat` are defined elsewhere on the page.
var searchArr;
// Check whether the browser already stores a search history.
if(localStorage.search){
// If so, split the stored string back into an array
// (localStorage holds strings, so the history is saved comma-joined).
searchArr= localStorage.search.split(",")
}else{
// Otherwise start with an empty history.
searchArr = [];
}
MapSearchArr();

// Record the current input as a history entry (if long enough),
// then navigate to the results page.
function add_search(){
var val = $(".searchInput").val();
if (val.length>=2){
// De-duplicate against the existing history on click.
KillRepeat(val);
// Persist the de-duplicated array back to localStorage.
localStorage.search = searchArr;
// Re-render the visible history list.
MapSearchArr();
}

window.location.href=search_url+'?q='+val+"&s_type="+$(".searchItem.current").attr('data-type')

}

// Render at most five history entries as links into the page.
function MapSearchArr(){
var tmpHtml = "";
var arrLen = 0
if (searchArr.length >= 5){
arrLen = 5
}else {
arrLen = searchArr.length
}
for (var i=0;i<arrLen;i++){
tmpHtml += '<a href="'+search_url+'?q='+searchArr[i]+'">'+searchArr[i]+'</a>'
}
$(".mysearch .all-search").html(tmpHtml);
}

通过redis的zrevrangebyscore函数计算分值的方式,以分值高的为热门来实现热门搜索

1
2
3
4
5
6
7
8
class IndexView(View):
    """Home page view: pulls the five highest-scoring search terms."""

    def get(self, request):
        # zrevrangebyscore walks the sorted set from +inf downward, so the
        # most frequently searched terms come back first.
        topn_search = redis_cli.zrevrangebyscore(
            "search_keywords_set", "+inf", "-inf", start=0, num=5)
        return render(request, "index.html", {"topn_search": topn_search})

# Excerpt from SearchView.get: bump the searched term's score, then reread
# the top five so the results page can show the hot list as well.
redis_cli.zincrby("search_keywords_set", 1, key_words)
topn_search = redis_cli.zrevrangebyscore(
    "search_keywords_set", "+inf", "-inf", start=0, num=5)

总结

熟悉了打造搜索引擎的流程及相关的知识,对request、response、post、get等知识有了一定的理解.不过这只是爬取了一个网站的信息,后期有了第一个网站之后可以慢慢增加其他网站.

效果展示