当前位置 博文首页 > 蜗牛为梦想而生H:爬取京东品牌和分类信息
/**
 * Controller that scrapes category and brand information from JD.com and stores it
 * through {@code CategoryService} and {@code BrandService}.
 *
 * <p>Requires the jsoup dependency:
 * <pre>{@code
 * <dependency>
 *   <groupId>org.jsoup</groupId>
 *   <artifactId>jsoup</artifactId>
 *   <version>1.11.3</version>
 * </dependency>
 * }</pre>
 */
@RestController
public class DemoController {
    @Autowired
    CategoryService categoryService;
    @Autowired
    BrandService brandService;

    /**
     * Fetches {@code https://www.jd.com/allSort.aspx} and saves a three-level category tree.
     *
     * <p>Each {@code category-item} becomes a level-1 category (parent id 0), each {@code dl}
     * inside it a level-2 category, and each link in that {@code dl}'s first {@code dd} a
     * level-3 category. A {@code dl} without a {@code dt} link is skipped instead of aborting
     * the whole import.
     *
     * @return a success result once every category found on the page has been saved
     * @throws IOException if the page cannot be fetched
     */
    @GetMapping("getCategoryFromJD")
    public AxiosResult<Void> setData() throws IOException {
        Document document = Jsoup.connect("https://www.jd.com/allSort.aspx").get();
        for (Element group : document.getElementsByClass("category-items")) {
            for (Element item : group.getElementsByClass("category-item")) {
                // Level-1 category: named after the text of the item's <span> elements.
                Category firstCategory = new Category();
                firstCategory.setCatetoryName(item.getElementsByTag("span").text());
                firstCategory.setCategoryLevel(1);
                firstCategory.setParentId(0L);
                categoryService.save(firstCategory);

                for (Element group2 : item.getElementsByTag("dl")) {
                    saveSecondLevelCategory(group2, firstCategory);
                }
            }
        }
        return AxiosResult.success();
    }

    /**
     * Saves the level-2 category described by one {@code dl} element and its level-3 children.
     *
     * @param dl     the {@code dl} element holding a {@code dt} title and a {@code dd} link list
     * @param parent the already-saved level-1 category
     */
    private void saveSecondLevelCategory(Element dl, Category parent) {
        // first() returns null on an empty result, whereas get(0) would throw.
        Element title = dl.getElementsByTag("dt").first();
        Element titleLink = title == null ? null : title.getElementsByTag("a").first();
        if (titleLink == null) {
            return;
        }

        Category secondCategory = new Category();
        // NOTE(review): assumes categoryService.save populates the id on the saved entity — confirm.
        secondCategory.setParentId(parent.getId());
        secondCategory.setCatetoryName(titleLink.text());
        secondCategory.setCategoryLevel(2);
        categoryService.save(secondCategory);

        Element children = dl.getElementsByTag("dd").first();
        if (children == null) {
            // A level-2 category without a child list is still kept.
            return;
        }
        for (Element childLink : children.getElementsByTag("a")) {
            Category thirdCategory = new Category();
            thirdCategory.setParentId(secondCategory.getId());
            thirdCategory.setCatetoryName(childLink.text());
            thirdCategory.setCategoryLevel(3);
            categoryService.save(thirdCategory);
        }
    }

    /**
     * Fetches {@code https://www.jd.com/brand.aspx} and saves one brand per {@code li} found
     * inside the {@code brandslist} elements.
     *
     * <p>The brand name is the text of the first link in the item's second {@code span}; the
     * logo and description come from the {@code src} and {@code alt} attributes of the item's
     * first {@code img}. Items missing either element are skipped.
     *
     * @return a success result once every brand found on the page has been saved
     * @throws Exception if the page cannot be fetched
     */
    @GetMapping("getBrandFromJD")
    public AxiosResult<Void> getBrandFromJd() throws Exception {
        Document document = Jsoup.connect("https://www.jd.com/brand.aspx").get();
        for (Element brandList : document.getElementsByClass("brandslist")) {
            for (Element item : brandList.getElementsByTag("li")) {
                Element img = item.getElementsByTag("img").first();
                Elements spans = item.getElementsByTag("span");
                Element nameLink = spans.size() > 1
                        ? spans.get(1).getElementsByTag("a").first()
                        : null;
                if (img == null || nameLink == null) {
                    continue;
                }

                Brand brand = new Brand();
                brand.setBrandName(nameLink.text());
                brand.setBrandDesc(img.attr("alt"));
                brand.setBrandLogo(img.attr("src"));
                // Every brand gets the same hard-coded site URL; the brand's own link is not read.
                brand.setBrandSite("http://www.baidu.com");
                brandService.save(brand);
            }
        }
        return AxiosResult.success();
    }
}
/**
 * Fetches {@code https://www.jd.com/allSort.aspx}, prints the link texts of every
 * {@code dl}'s first {@code dd} to standard output, and writes the same texts to a
 * UTF-8 text file, one {@code dl} per line.
 *
 * <p>A {@code dl} without a {@code dd} is skipped. If the page yields no matching elements,
 * an empty file is written.
 *
 * @throws IOException if the page cannot be fetched or the file cannot be written
 */
public static void setData() throws IOException {
    Document document = Jsoup.connect("https://www.jd.com/allSort.aspx").get();
    // Single accumulator; the original's second reference could still be null when used.
    StringBuilder output = new StringBuilder();
    for (Element group : document.getElementsByClass("category-items")) {
        for (Element item : group.getElementsByClass("category-item")) {
            for (Element dl : item.getElementsByTag("dl")) {
                // first() returns null on an empty result, whereas get(0) would throw.
                Element links = dl.getElementsByTag("dd").first();
                if (links == null) {
                    continue;
                }
                for (Element link : links.getElementsByTag("a")) {
                    String text = link.text();
                    System.out.print(text + " ");
                    output.append(text).append(" ");
                }
                System.out.println("\n");
                output.append("\r\n");
            }
        }
    }
    // try-with-resources closes the stream even when write() fails.
    try (FileOutputStream stream = new FileOutputStream("C://Users//Desktop//京东分类目录.txt")) {
        stream.write(output.toString().getBytes(StandardCharsets.UTF_8));
    }
}
?
cs