当前位置 博文首页 > 蜗牛为梦想而生H:爬取京东品牌和分类信息

    蜗牛为梦想而生H:爬取京东品牌和分类信息

    作者:蜗牛为梦想而生H 时间:2021-09-07 19:21

    爬取并存入数据库:

    /**
     * Imports category and brand data scraped from JD.com pages.
     *
     * <p>Requires the jsoup HTML parser on the classpath:
     *
     * <pre>{@code
     * <dependency>
     *   <groupId>org.jsoup</groupId>
     *   <artifactId>jsoup</artifactId>
     *   <version>1.11.3</version>
     * </dependency>
     * }</pre>
     */
    @RestController
    public class DemoController {

        @Autowired
        CategoryService categoryService;

        @Autowired
        BrandService brandService;

        /**
         * Fetches the JD "all categories" page and saves the category tree it contains.
         *
         * <p>Each {@code category-item} becomes a level-1 category (parent id {@code 0}),
         * each {@code <dl>} inside it a level-2 category, and each link inside that
         * {@code <dl>}'s first {@code <dd>} a level-3 category. A {@code <dl>} without a
         * title link is skipped instead of aborting the whole import.
         *
         * @return a success result once every parsable category has been saved
         * @throws IOException if the page cannot be fetched
         */
        @GetMapping("getCategoryFromJD")
        public AxiosResult<Void> setData() throws IOException {
            Document document = Jsoup.connect("https://www.jd.com/allSort.aspx").get();

            for (Element group : document.getElementsByClass("category-items")) {
                for (Element item : group.getElementsByClass("category-item")) {
                    // Level-1 name: combined text of every <span> inside the item.
                    Category firstCategory = new Category();
                    firstCategory.setCatetoryName(item.getElementsByTag("span").text());
                    firstCategory.setCategoryLevel(1);
                    firstCategory.setParentId(0L);
                    categoryService.save(firstCategory);

                    for (Element dl : item.getElementsByTag("dl")) {
                        saveSubCategories(dl, firstCategory);
                    }
                }
            }
            return AxiosResult.success();
        }

        /**
         * Fetches the JD brand page and saves one brand per list item.
         *
         * <p>The brand name is the text of the first link in the item's second
         * {@code <span>}; the logo and description come from the first {@code <img>}'s
         * {@code src} and {@code alt} attributes. Items missing either element are
         * skipped rather than failing the request after earlier brands were stored.
         *
         * @return a success result once every parsable brand has been saved
         * @throws Exception if the page cannot be fetched or saving fails
         */
        @GetMapping("getBrandFromJD")
        public AxiosResult<Void> getBrandFromJd() throws Exception {
            Document document = Jsoup.connect("https://www.jd.com/brand.aspx").get();

            for (Element list : document.getElementsByClass("brandslist")) {
                for (Element item : list.getElementsByTag("li")) {
                    Element img = firstByTag(item, "img");
                    Elements spans = item.getElementsByTag("span");
                    Element nameLink = spans.size() > 1 ? firstByTag(spans.get(1), "a") : null;
                    if (img == null || nameLink == null) {
                        // Unexpected markup for this entry: skip it, keep importing the rest.
                        continue;
                    }

                    Brand brand = new Brand();
                    brand.setBrandName(nameLink.text());
                    brand.setBrandDesc(img.attr("alt"));
                    brand.setBrandLogo(img.attr("src"));
                    // The brand site is a fixed placeholder; it is not read from the page.
                    brand.setBrandSite("http://www.baidu.com");
                    brandService.save(brand);
                }
            }
            return AxiosResult.success();
        }

        /** Saves the level-2 category described by {@code dl} and its level-3 children. */
        private void saveSubCategories(Element dl, Category firstCategory) {
            Element titleLink = firstByTag(firstByTag(dl, "dt"), "a");
            if (titleLink == null) {
                return;
            }
            Category secondCategory = saveChildCategory(firstCategory, titleLink.text(), 2);

            Element dd = firstByTag(dl, "dd");
            if (dd == null) {
                return;
            }
            for (Element link : dd.getElementsByTag("a")) {
                saveChildCategory(secondCategory, link.text(), 3);
            }
        }

        /**
         * Creates and saves a category whose parent id is {@code parent.getId()}.
         * Relies on the parent's id being set by the time this is called (presumably
         * populated by {@code categoryService.save} - confirm against the service).
         */
        private Category saveChildCategory(Category parent, String name, int level) {
            Category child = new Category();
            child.setParentId(parent.getId());
            child.setCatetoryName(name);
            child.setCategoryLevel(level);
            categoryService.save(child);
            return child;
        }

        /** Returns the first element with the given tag under {@code parent}, or null if none. */
        private static Element firstByTag(Element parent, String tag) {
            return parent == null ? null : parent.getElementsByTag(tag).first();
        }
    }
    

    爬取并输出到 txt 文件:

     /**
      * Fetches the JD "all categories" page and writes the third-level category names
      * to a UTF-8 text file, echoing the same names to standard output.
      *
      * <p>Each {@code <dl>} on the page produces one line: the text of every link in
      * its first {@code <dd>}, each followed by a space, terminated by {@code \r\n}.
      * A {@code <dl>} without a {@code <dd>} is skipped. If the page yields no
      * categories, an empty file is written.
      *
      * @throws IOException if the page cannot be fetched or the file cannot be written
      */
     public static void setData() throws IOException {
         Document document = Jsoup.connect("https://www.jd.com/allSort.aspx").get();
         // Single accumulator that is never null, even when no links are found.
         StringBuilder output = new StringBuilder();

         for (Element group : document.getElementsByClass("category-items")) {
             for (Element item : group.getElementsByClass("category-item")) {
                 for (Element dl : item.getElementsByTag("dl")) {
                     Element dd = dl.getElementsByTag("dd").first();
                     if (dd == null) {
                         continue;
                     }
                     for (Element link : dd.getElementsByTag("a")) {
                         String text = link.text();
                         System.out.print(text + " ");
                         output.append(text).append(' ');
                     }
                     System.out.println("\n");
                     output.append("\r\n");
                 }
             }
         }

         byte[] bytes = output.toString().getBytes(StandardCharsets.UTF_8);
         // try-with-resources closes the stream even if the write fails.
         try (FileOutputStream stream = new FileOutputStream("C://Users//Desktop//京东分类目录.txt")) {
             stream.write(bytes);
         }
     }

    ?

    cs
    下一篇:没有了