网络知识 娱乐 httpclient 爬虫实例——爬取三级中学名

httpclient 爬虫实例——爬取三级中学名

本人在使用 httpclient 的过程中,突然想起来可以爬取一些数据,比如全国的中学名。这个想法并非凭空而来:之前也做过这方面的爬虫,不过是基于 selenium 做的 UI 脚本,效率非常低,而且很不稳定,所以这次采取了直接调用接口的形式,果然效率提升了几个档次。一共 6 万+条数据,用了 16 分钟左右,其中包括写入数据库的时间。现在分享代码供大家参考。关键信息已隐去,大家看一下思路就好了。

  1package practise;
  2
  3import java.util.ArrayList;
  4import java.util.HashMap;
  5import java.util.List;
  6import java.util.Map;
  7import java.util.Set;
  8import java.util.regex.Matcher;
  9import java.util.regex.Pattern;
 10import org.apache.http.client.methods.HttpGet;
 11import net.sf.json.JSONObject;
 12import source.ApiLibrary;
 13import source.Concurrent;
 14
 15public class Crawler extends ApiLibrary {
 16    public static String host = "";
 17    public static Map<String, Integer> countrys = new HashMap<>();
 18    public static Map<String, Integer> citys = new HashMap<>();
 19    public static Map<String, Integer> address = new HashMap<>();
 20    public static Map<String, Integer> school = new HashMap<>();
 21    public static List<String> total = new ArrayList<>();
 22
 23    public static void main(String[] args) {
 24        Crawler crawler = new Crawler();
 25        crawler.getCountry1();// 省份
 26        Set<String> countryId = countrys.keySet();
 27        for (String name : countryId) {
 28            int id = countrys.get(name);
 29            crawler.getCountry2(id);// 市
 30            Set<String> cityId = citys.keySet();
 31            for (String city : cityId) {
 32                int cid = citys.get(city);
 33                crawler.getCountry3(cid);// 县
 34                Set<String> adresss = address.keySet();
 35                for (String adres : adresss) {
 36                    int aid = address.get(adres);
 37                    crawler.getCountry4(aid);// 名
 38                    Set<String> schol = school.keySet();
 39                    for (String sch : schol) {
 40                        String line = name + PART + city + PART + adres + PART + sch;
 41                        total.add(line);
 42                    }
 43                }
 44            }
 45        }
 46        Concurrent.saveRequestTimes(total);
 47        testOver();
 48    }
 49
 50    /**
 51     * 查询省份
 52     */
 53    public void getCountry1() {
 54        String url = host + "/user/editinfo/getSchollCountryList";
 55        HttpGet httpGet = getHttpGet(url);
 56        // httpGet.addHeader("Cookie", cookies);
 57        // httpGet.addHeader("User-Agent", userangent);
 58        JSONObject response = getHttpResponseEntityByJson(httpGet);
 59        String[] country = response.getString("content").split("</a>");
 60        int size = country.length;
 61        for (int i = 0; i < size; i++) {
 62            String msg = country[i];
 63            int code = getCode(msg);
 64            String name = getName(msg);
 65            countrys.put(name, code);
 66        }
 67    }
 68
 69    /**
 70     * 查询市
 71     * 
 72     * @param id
 73     */
 74    public void getCountry2(int id) {
 75        String url = host + "/user/editinfo/getSchollCityList?region_id=" + id;
 76        HttpGet httpGet = getHttpGet(url);
 77        JSONObject response = getHttpResponseEntityByJson(httpGet);
 78        String[] ssString = response.getString("content").split("</a>");
 79        int size = ssString.length;
 80        citys.clear();
 81        for (int i = 0; i < size; i++) {
 82            String msg = ssString[i];
 83            int code = getCode(msg);
 84            String name = getName(msg);
 85            citys.put(name, code);
 86        }
 87
 88    }
 89
 90    /**
 91     * 查询县
 92     * 
 93     * @param id
 94     */
 95    public void getCountry3(int id) {
 96        String url = host + "/user/editinfo/getSchollAddressList?region_id=" + id;
 97        HttpGet httpGet = getHttpGet(url);
 98        JSONObject response = getHttpResponseEntityByJson(httpGet);
 99        String[] ssString = response.getString("content").split("</a>");
100        int size = ssString.length;
101        address.clear();
102        for (int i = 0; i < size; i++) {
103            String msg = ssString[i];
104            int code = getCode(msg);
105            String name = getName(msg);
106            address.put(name, code);
107        }
108    }
109
110    /**
111     * 查询学校
112     * 
113     * @param id
114     */
115    public void getCountry4(int id) {
116        String url = host + "/user/editinfo/getSchoolNameList?region_id=" + id;
117        HttpGet httpGet = getHttpGet(url);
118        JSONObject response = getHttpResponseEntityByJson(httpGet);
119        String[] ssString = response.getString("content").split("</a>");
120        int size = ssString.length;
121        school.clear();
122        for (int i = 0; i < size; i++) {
123            String msg = ssString[i];
124            int code = getCode(msg);
125            String name = getName(msg);
126            school.put(name, code);
127        }
128    }
129
130    /**
131     * 获取 code
132     * 
133     * @param text
134     * @return
135     */
136    public int getCode(String text) {
137        int code = 0;
138        Pattern pattern = Pattern.compile(""\d+"");
139        Matcher matcher = pattern.matcher(text);
140        if (matcher.find()) {
141            code = changeStringToInt(matcher.group(0).replace(""", ""));
142        }
143        return code;
144    }
145
146    /**
147     * 获取名称
148     * 
149     * @param text
150     * @return
151     */
152    public String getName(String text) {
153        String name = text.substring(text.lastIndexOf(">") + 1, text.length());
154        return name;
155    }
156
157}

下面是爬取到的数据截图

技术类文章精选

  • java一行代码打印心形
  • Linux性能监控软件netdata中文汉化版
  • 接口测试代码覆盖率(jacoco)方案分享
  • 性能测试框架
  • 如何在Linux命令行界面愉快进行性能测试
  • 图解HTTP脑图
  • 将swagger文档自动变成测试代码
  • 五行代码构建静态博客
  • 基于java的直线型接口测试框架初探
  • 单点登录性能测试方案

非技术文章精选

  • 为什么选择软件测试作为职业道路?
  • 写给所有人的编程思维
  • 自动化测试的障碍
  • 自动化测试的问题所在
  • 成为优秀自动化测试工程师的7个步骤
  • 未来10年软件测试的新趋势-上
  • 未来10年软件测试的新趋势-下
  • 手动测试存在的重要原因

大咖风采

  • Tcloud 云测平台--集大成者
  • Android App 测试工具及知识大集合
  • 4399AT UI自动化CI与CD
  • Android App常规测试内容
  • JVM的对象和堆

点击查看公众号地图