南和县住房和建设局网站,锡林郭勒盟建设厅官方网站,新闻国际最新消息,网站建设 客户定位

行政区划获取

目录：一、导入jar包　二、代码展示

背景：公司的行政区划代码有问题，有的没有街道信息，有的关联信息有误。然后找到了国家的网站“国家统计局-行政区划”，这个里面包含了所有的行政区划信息，但是全是 html 页面，没法自动获取，只能去爬取这些数据了。Java 语言有第三方类库 Jsoup，它是一个仿浏览器的第三方库，可以通过它来获取页面信息。

一、导入jar包
下面是笔者用到的全部jar包：

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>30.1.1-jre</version>
</dependency>
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-json</artifactId>
    <version>5.4.0</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.44</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.5</version>
</dependency>
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
    <version>2.5.4</version>
</dependency>

这里说下：maven 配置要从阿里云下载 jar，若是从中央仓库下载将会非常慢。
阿里云私服：http://maven.aliyun.com/nexus/content/repositories/central/
二、代码展示
这里是代码的展示。笔者是在网上搜到的代码基础上改造的；不改造的话，网站有反爬，大概爬取 2000 条左右就会中断。笔者加了延时，这样就避开了反爬（可能还有别的规避措施）。这里爬取的是 4 级行政区划：省、市、区县、街道。
package com.cheng.controller; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.xssf.streaming.SXSSFSheet; import org.apache.poi.xssf.streaming.SXSSFWorkbook; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements;
import java.io.FileOutputStream; import java.io.IOException; import java.net.ConnectException; import java.net.SocketTimeoutException; import java.util.*;
/** author pcc version 1.0.0 className JsoupTest date 2023-03-02 10:39 */ public class JsoupTestPluMdm { static int i 1; static String url1 “http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2022”; static String url2 “”; public static void main(String[] args) throws IOException { try{ListMapString,String listMap new ArrayList();Document document Jsoup.connect(url1).header(Cookie, wzws_sessionidoGQAAyWBMmNlMWZkgjdlZDJkMIAyMjEuMjM4LjEzMi41MA; SF_cookie_115502425; wzws_cid6e8cdc0aea81349b05c8a0b6c05cd7204b6e0f10e5a48d462175473d23abcb4891edf1ceb73464398cb1ce7e6f53999f7545dd0014a15b1fb4eec5c6cf37421f0c2b08528de36f728ec4c676ed264c7d).get();//获取他所有的省Elements elements document.select(body table:nth-child(3) tbody tr:nth-child(1) td table tbody tr:nth-child(2) td table tbody tr td table tbody);//解析省的超链接Elements elements1 elements.select(tbody tr td a);for(int j0;jelements1.size();j){// Thread.sleep(100); String s elements1.get(j).select(“a”).attr(“href”); String provinceCode s.replaceAll(“.html”,“”)“0000”; System.out.println(“省代码:” provinceCode); String provinceName elements1.get(j).text(); System.out.println(“省名称:” provinceName); MapString,String map new HashMap(); map.put(provinceCode,provinceName); listMap.add(map); } for (int i1 0; i1 31; i1) {System.out.println(**********************i********************:i);if(i%10000){Thread.sleep(1000*60*10);}MapString, String stringStringMap listMap.get(i1);IteratorMap.EntryString, String iterator stringStringMap.entrySet().iterator();while(iterator.hasNext()){Map.EntryString,String entry iterator.next();String provinceCode entry.getKey();String provinceName entry.getValue();String index provinceCode.substring(0,2).html;SXSSFWorkbook wb new SXSSFWorkbook(100);SXSSFSheet sheet (SXSSFSheet) wb.createSheet();// TODO 这里改成自己的地址即可也可以存放到一个文件里String enterFileName C:\\Users\\pcc\\Desktop\\xingzhengquhua\\provinceName.xlsx;FileOutputStream fileOut new FileOutputStream(enterFileName);Row row 
sheet.createRow(0);sheet.createRow(i).createCell(0).setCellValue(provinceCode);// idsheet.getRow(i).createCell(1).setCellValue(provinceName);// namesheet.getRow(i).createCell(2).setCellValue(); // pidsheet.getRow(i).createCell(3).setCellValue(1); // typei;try {jsoupList2(url1 / index, provinceName, provinceCode, sheet);} catch (SocketTimeoutException e) {e.printStackTrace();jsoupList2(url1 / index, provinceName, provinceCode, sheet);} catch (ConnectException e) {e.printStackTrace();jsoupList2(url1 / index, provinceName, provinceCode, sheet);}row.createCell(0).setCellValue(id);row.createCell(1).setCellValue(district_name);row.createCell(2).setCellValue(pid);row.createCell(3).setCellValue(type);wb.write(fileOut);fileOut.close();}}}catch (Exception e){e.printStackTrace();}finally {}}//市级页面
public static void jsoupList2(String url,String provinceName,String provinceCode,SXSSFSheet sheet) throws Exception {String cityName ;String cityCode ;url2 url.replace(.html,);Document document Jsoup.connect(url).get();Elements elements document.select(body table:nth-child(3) tbody tr:nth-child(1) td table tbody tr:nth-child(2) td table tbody tr td table tbody);Elements elements1 elements.select(tbody tr td);//j从2开始是因为他有个表头 统计用区划代码 名称for (int j 2; j elements1.size(); j) {System.out.println(**********************i********************:i);if(i%10000){Thread.sleep(1000*60*10);}// Thread.sleep(500); //判断是否是超链接不是超链接也要获取数据 if(elements1.get(j).select(“td a”).toString().equals(“”)){ String text elements1.get(j).text(); if (j % 2 0) { System.out.println(“市代码:” text); sheet.createRow(i).createCell(0).setCellValue(text); } else { System.out.println(“市名称:” text); sheet.getRow(i).createCell(1).setCellValue(text); sheet.getRow(i).createCell(2).setCellValue(provinceCode); sheet.getRow(i).createCell(3).setCellValue(“3”); i; } }else { Elements elements2 elements1.get(j).select(“td a”); for (int j1 0; j1 elements2.size(); j1) { String text elements2.get(j1).text(); if (j % 2 0) { System.out.println(“市代码:” text); cityCode text; sheet.createRow(i).createCell(0).setCellValue(text); } else { System.out.println(“市名称:” text); cityName text; sheet.getRow(i).createCell(1).setCellValue(text); sheet.getRow(i).createCell(2).setCellValue(provinceCode); sheet.getRow(i).createCell(3).setCellValue(“2”); i; String s elements2.get(j1).select(“a”).attr(“href”); //TODO 这里排除了海南的几个市区,更改为不排除任何市区 if(true) { try { jsoupList3(url1 “/” s,cityName,cityCode,provinceName,provinceCode, sheet); } catch (SocketTimeoutException e) { e.printStackTrace(); jsoupList3(url1 “/” s,cityName,cityCode,provinceName,provinceCode, sheet); } catch (ConnectException e) { e.printStackTrace(); jsoupList3(url1 “/” s,cityName,cityCode,provinceName,provinceCode, sheet); } } } } } } } //县级页面 public static void jsoupList3(String 
url,String cityName,String cityCode,String provinceName,String provinceCode,SXSSFSheet sheet) throws Exception { Document document Jsoup.connect(url).get(); Elements elements document.select(“body table:nth-child(3) tbody tr:nth-child(1) td table tbody tr:nth-child(2) td table tbody tr td table tbody”); Elements elements1 elements.select(“tbody tr td”); String xianName “”; String xianCode “”; //j从2开始是因为他有个表头 统计用区划代码 名称 for (int j 2; j elements1.size(); j) { System.out.println(“i:i); if(i%10000){ Thread.sleep(10006010); } // Thread.sleep(500); //判断是否是超链接不是超链接也要获取数据 if(elements1.get(j).select(“td a”).toString().equals(”“)){ String text elements1.get(j).text(); if (j % 2 0) { System.out.println(“县代码:” text); sheet.createRow(i).createCell(0).setCellValue(text); } else { System.out.println(“县名称:” text); sheet.getRow(i).createCell(1).setCellValue(text); sheet.getRow(i).createCell(2).setCellValue(cityCode); sheet.getRow(i).createCell(3).setCellValue(“3”); i; } }else { Elements elements2 elements1.get(j).select(“td a”); for (int j1 0; j1 elements2.size(); j1) { String text elements2.get(j1).text(); xianName text; if (j % 2 0) { xianCode text; System.out.println(“县代码:” xianCode); sheet.createRow(i).createCell(0).setCellValue(text); } else { System.out.println(“县名称:” text); sheet.getRow(i).createCell(1).setCellValue(text); sheet.getRow(i).createCell(2).setCellValue(cityCode); sheet.getRow(i).createCell(3).setCellValue(“3”); i; String s elements2.get(j1).select(“a”).attr(“href”); try { jsoupList4(url1 “/” provinceCode.substring(0,2)”/“ s,xianName,xianCode,cityName,cityCode,provinceName,provinceCode, sheet); } catch (SocketTimeoutException e) { e.printStackTrace(); jsoupList4(url1 “/” provinceCode.substring(0,2)”/“ s,xianName,xianCode,cityName,cityCode,provinceName,provinceCode, sheet); } catch (ConnectException e) { e.printStackTrace(); jsoupList4(url1 “/” provinceCode.substring(0,2)”/ s,xianName,xianCode,cityName,cityCode,provinceName,provinceCode, sheet); } } } } } }
//街道页面
public static void jsoupList4(String url,String xianName,String xianCode,String cityName,String cityCode,String provinceName,String provinceCode,SXSSFSheet sheet) throws Exception {Document document Jsoup.connect(url).get();Elements elements document.select(body table:nth-child(3) tbody tr:nth-child(1) td table tbody tr:nth-child(2) td table tbody tr td table tbody);Elements elements1 elements.select(tbody tr td);//j从2开始是因为他有个表头 统计用区划代码 名称for (int j 2; j elements1.size(); j) {System.out.println(**********************i********************:i);if(i%10000){Thread.sleep(1000*60*10);}// Thread.sleep(500); //判断是否是超链接不是超链接也要获取数据 if(elements1.get(j).select(“td a”).toString().equals(“”)){ String text elements1.get(j).text(); if (j % 2 0) { System.out.println(“街道代码:” text); sheet.createRow(i).createCell(0).setCellValue(text); } else { System.out.println(“街道名称:” text); sheet.getRow(i).createCell(1).setCellValue(text); sheet.getRow(i).createCell(2).setCellValue(xianCode); sheet.getRow(i).createCell(3).setCellValue(“4”); i; } }else { Elements elements2 elements1.get(j).select(“td a”); for (int j1 0; j1 elements2.size(); j1) { String text elements2.get(j1).text(); if (j % 2 0) { System.out.println(“街道代码:” text);// TODO 这里不能截取不然街道界别数据截不全 sheet.createRow(i).createCell(0).setCellValue(text); } else { System.out.println(“街道名称:” text); sheet.getRow(i).createCell(1).setCellValue(text); sheet.getRow(i).createCell(2).setCellValue(xianCode); sheet.getRow(i).createCell(3).setCellValue(“4”); i; } } } } } }