forked from gsingers/search_with_machine_learning_course
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex-bbuy.logstash
101 lines (98 loc) · 4.84 KB
/
index-bbuy.logstash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
input {
  # Ingest the product XML dump: each matching file is read from the top in
  # "read" mode, with exit_after_read enabled.
  file {
    # Location of the ungzipped/untarred product data. To start smaller, point
    # this at fewer files or at a directory holding a copy of just a few of them.
    path => ["/workspace/datasets/product_data/products/*.xml"]
    mode => "read"
    start_position => "beginning"
    exit_after_read => true

    # Read positions and the completed-file log are both sent to /dev/null,
    # so nothing is remembered between runs.
    sincedb_path => "/dev/null"
    file_completed_action => "log"
    file_completed_log_path => "/dev/null"

    # Group lines into one event per product: any line that does NOT match
    # "<product>" is appended to the previous line's event.
    codec => multiline {
      what => "previous"
      negate => "true"
      pattern => "<product>"
    }
  }
}
filter {
  # Pull individual product fields out of the raw XML held in "message".
  xml {
    source => "message"
    target => "xml_doc"
    store_xml => "false"
    # XPath expression => destination field.
    # NOTE: this list of fields is not exhaustive. To index more of the product
    # record, add the appropriate XPath expression below.
    # TODO: could XPath/XSL functions generate this mapping so that the long
    # list does not have to be maintained by hand?
    xpath => {
      "/product/productId/text()" => "productId"
      "/product/sku/text()" => "sku"
      "/product/name/text()" => "name"
      "/product/type/text()" => "type"
      "/product/startDate/text()" => "startDate"
      "/product/active/text()" => "active"
      "/product/regularPrice/text()" => "regularPrice"
      "/product/salePrice/text()" => "salePrice"
      "/product/onSale/text()" => "onSale"
      "/product/digital/text()" => "digital"
      # The "*" step matches every child element, collecting all subfields.
      "/product/frequentlyPurchasedWith/*/text()" => "frequentlyPurchasedWith"
      "/product/accessories/*/text()" => "accessories"
      "/product/relatedProducts/*/text()" => "relatedProducts"
      "/product/crossSell/text()" => "crossSell"
      "/product/salesRankShortTerm/text()" => "salesRankShortTerm"
      "/product/salesRankMediumTerm/text()" => "salesRankMediumTerm"
      "/product/salesRankLongTerm/text()" => "salesRankLongTerm"
      "/product/bestSellingRank/text()" => "bestSellingRank"
      "/product/url/text()" => "url"
      # Category path: names and ids of every entry (wildcard match), the id of
      # the last <category> entry, and the number of named entries.
      "/product/categoryPath/*/name/text()" => "categoryPath"
      "/product/categoryPath/*/id/text()" => "categoryPathIds"
      "/product/categoryPath/category[last()]/id/text()" => "categoryLeaf"
      "count(/product/categoryPath/*/name)" => "categoryPathCount"
      "/product/customerReviewCount/text()" => "customerReviewCount"
      "/product/customerReviewAverage/text()" => "customerReviewAverage"
      "/product/inStoreAvailability/text()" => "inStoreAvailability"
      "/product/onlineAvailability/text()" => "onlineAvailability"
      "/product/releaseDate/text()" => "releaseDate"
      "/product/shippingCost/text()" => "shippingCost"
      "/product/shortDescription/text()" => "shortDescription"
      "/product/shortDescriptionHtml/text()" => "shortDescriptionHtml"
      "/product/class/text()" => "class"
      "/product/classId/text()" => "classId"
      "/product/subclass/text()" => "subclass"
      "/product/subclassId/text()" => "subclassId"
      "/product/department/text()" => "department"
      "/product/departmentId/text()" => "departmentId"
      "/product/bestBuyItemId/text()" => "bestBuyItemId"
      "/product/description/text()" => "description"
      "/product/manufacturer/text()" => "manufacturer"
      "/product/modelNumber/text()" => "modelNumber"
      "/product/image/text()" => "image"
      "/product/condition/text()" => "condition"
      "/product/inStorePickup/text()" => "inStorePickup"
      "/product/homeDelivery/text()" => "homeDelivery"
      "/product/quantityLimit/text()" => "quantityLimit"
      "/product/color/text()" => "color"
      "/product/depth/text()" => "depth"
      "/product/height/text()" => "height"
      "/product/weight/text()" => "weight"
      "/product/shippingWeight/text()" => "shippingWeight"
      "/product/width/text()" => "width"
      "/product/longDescription/text()" => "longDescription"
      "/product/longDescriptionHtml/text()" => "longDescriptionHtml"
      # Wildcard again: one value per child of <features>.
      "/product/features/*/text()" => "features"
    }
  }

  # Events without a productId (the root event of each file) carry no product
  # information, so discard them.
  if ![productId] {
    drop {}
  }

  # Strip the fields that are not needed in the indexed document.
  mutate {
    remove_field => ["message", "host", "path"]
  }
}
output {
  # Send each product event to the OpenSearch instance at localhost:9200,
  # into the "bbuy_products" index.
  opensearch {
    hosts => ["https://localhost:9200"]
    index => "bbuy_products"

    # Credentials are taken from the environment when OPENSEARCH_USER /
    # OPENSEARCH_PASSWORD are set; otherwise they fall back to the values that
    # were previously hard-coded here, so existing setups keep working.
    user => "${OPENSEARCH_USER:admin}"
    password => "${OPENSEARCH_PASSWORD:admin}"

    # The product SKU is used as the document id.
    # NOTE(review): presumably so that re-running the import replaces existing
    # documents rather than duplicating them; confirm this is the intent.
    document_id => "%{sku}"

    # NOTE(review): TLS certificate verification is disabled. Confirm this
    # pipeline only ever targets a local development cluster; enable
    # verification for anything else.
    ssl_certificate_verification => false
  }
}