Skip to content

Commit acf4fbf

Browse files
author
caodongping
committed
article parser
1 parent cb3fccd commit acf4fbf

File tree

6 files changed

+181
-98
lines changed

6 files changed

+181
-98
lines changed

app/src/main/java/com/github/mzule/androidweekly/api/ArticleApi.java

Lines changed: 6 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import android.os.Handler;
55
import android.text.TextUtils;
66

7+
import com.github.mzule.androidweekly.api.parser.ArticleParsers;
78
import com.github.mzule.androidweekly.dao.ArticleDao;
89
import com.github.mzule.androidweekly.dao.IssueListKeeper;
910
import com.github.mzule.androidweekly.entity.Article;
@@ -100,106 +101,13 @@ private Response<List<Issue>> doGetArchive() throws Exception {
100101
}
101102

102103
private Response<List<Object>> doGetPage(String issue) throws Exception {
103-
String url = "http://androidweekly.net";
104-
if (issue != null) {
105-
url += issue;
106-
}
107-
final List<Object> articles = new ArrayList<>();
108-
Document doc = Jsoup.parse(new URL(url), 30000);
109-
if (issue == null || isBiggerThan100(issue)) {
110-
parse(doc, articles, issue);
111-
} else {
112-
Element root = doc.getElementsByClass("issue").get(0);
113-
while (root.children().size() == 1) {
114-
root = root.child(0);
115-
}
116-
String currentSection = null;
117-
for (Element e : root.children()) {
118-
if (e.tagName().equals("h2")) {
119-
currentSection = e.text();
120-
articles.add(currentSection);
121-
continue;
122-
}
123-
if (e.tagName().equals("div")) {
124-
Elements img = e.getElementsByTag("img");
125-
if (!img.isEmpty()) {
126-
Article article = new Article();
127-
article.setImageUrl(img.get(0).attr("src"));
128-
article.setTitle(e.getElementsByTag("a").get(1).text());
129-
article.setLink(e.getElementsByTag("a").get(1).attr("href"));
130-
article.setBrief(e.getElementsByTag("p").get(0).text());
131-
Elements span = e.getElementsByTag("span");
132-
if (!span.isEmpty()) {
133-
article.setDomain(span.get(0).text().replace("(", "").replace(")", ""));
134-
}
135-
article.setIssue(issue);
136-
article.setSection(currentSection);
137-
articles.add(article);
138-
//articleDao.save(article);
139-
}
140-
} else {
141-
Article article = new Article();
142-
Elements title = e.getElementsByTag("a");
143-
if (title.isEmpty()) {
144-
continue;
145-
}
146-
article.setTitle(title.get(0).text());
147-
Elements span = e.getElementsByTag("span");
148-
if (!span.isEmpty()) {
149-
article.setDomain(span.get(0).text().replace("(", "").replace(")", ""));
150-
}
151-
article.setLink(e.getElementsByTag("a").get(0).attr("href"));
152-
article.setBrief(e.text());
153-
article.setIssue(issue);
154-
article.setSection(currentSection);
155-
articles.add(article);
156-
//articleDao.save(article);
157-
}
158-
}
159-
}
160-
return new Response<>(articles, false);
161-
}
162-
163-
private boolean isBiggerThan100(String issue) {
164-
String s = issue.split("-")[1];
165-
return Integer.parseInt(s) >= 103;
166-
}
167-
168-
private void parse(Document doc, List<Object> articles, String issue) {
169-
Elements tables = doc.getElementsByTag("table");
170-
String currentSection = null;
171-
for (Element e : tables) {
172-
Elements h2 = e.getElementsByTag("h2");
173-
if (!h2.isEmpty()) {
174-
currentSection = h2.get(0).text();
175-
articles.add(currentSection);
176-
} else {
177-
Elements tds = e.getElementsByTag("td");
178-
Element td = tds.get(tds.size() - 2);
179-
String imageUrl = null;
180-
if (tds.size() == 4) {
181-
imageUrl = tds.get(0).getElementsByTag("img").get(0).attr("src");
182-
}
183-
String title = td.getElementsByClass("article-headline").get(0).text();
184-
String brief = td.getElementsByTag("p").get(0).text();
185-
String link = td.getElementsByClass("article-headline").get(0).attr("href");
186-
String domain = td.getElementsByTag("span").get(0).text().replace("(", "").replace(")", "");
187-
if (issue == null) {
188-
String number = doc.getElementsByClass("issue-header").get(0).getElementsByTag("span").get(0).text();
189-
issue = "/issues/issue-" + number.replace("#", "");
190-
}
191-
Article article = new Article();
192-
article.setTitle(title);
193-
article.setBrief(brief);
194-
article.setLink(link);
195-
article.setDomain(domain);
196-
article.setIssue(issue);
197-
article.setImageUrl(imageUrl);
198-
article.setSection(currentSection);
199-
articles.add(article);
200-
articleDao.save(article);
104+
List<Object> result = ArticleParsers.get(issue).parse(issue);
105+
for (Object obj : result) {
106+
if (obj instanceof Article) {
107+
articleDao.save((Article) obj);
201108
}
202109
}
110+
return new Response<>(result, false);
203111
}
204112

205113
private <T> void postSuccess(final Response<T> result, final ApiCallback<T> callback) {
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package com.github.mzule.androidweekly.api.parser;
2+
3+
import java.io.IOException;
4+
import java.util.List;
5+
6+
/**
 * Created by CaoDongping on 4/15/16.
 * <p>
 * Strategy for turning one issue page into the flat list shown to the user.
 */
public interface ArticleParser {

    /**
     * Parses the page of the given issue.
     *
     * @param issue issue path to parse; implementations in this package also accept
     *              {@code null}
     * @return the parsed entries, typed as {@code Object} because implementations may mix
     *         different kinds of rows in one list
     * @throws IOException if the page cannot be read
     */
    List<Object> parse(String issue) throws IOException;
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
package com.github.mzule.androidweekly.api.parser;
2+
3+
import android.support.annotation.WorkerThread;
4+
5+
/**
6+
* Created by CaoDongping on 4/15/16.
7+
*/
8+
public class ArticleParsers {
9+
@WorkerThread
10+
public static ArticleParser get(String issue) {
11+
if (issue == null || Integer.parseInt(issue.split("-")[1]) > 102) {
12+
return new FresherArticlesParser();
13+
} else {
14+
return new OlderArticlesParser();
15+
}
16+
}
17+
18+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package com.github.mzule.androidweekly.api.parser;
2+
3+
import org.jsoup.Jsoup;
4+
import org.jsoup.nodes.Document;
5+
6+
import java.io.IOException;
7+
import java.net.URL;
8+
9+
/**
10+
* Created by CaoDongping on 4/15/16.
11+
*/
12+
public class DocumentProvider {
13+
public static Document get(String issue) throws IOException {
14+
String url = "http://androidweekly.net/";
15+
if (issue != null) {
16+
url += issue;
17+
}
18+
return Jsoup.parse(new URL(url), 30000);
19+
}
20+
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
package com.github.mzule.androidweekly.api.parser;
2+
3+
import com.github.mzule.androidweekly.entity.Article;
4+
5+
import org.jsoup.nodes.Document;
6+
import org.jsoup.nodes.Element;
7+
import org.jsoup.select.Elements;
8+
9+
import java.io.IOException;
10+
import java.util.ArrayList;
11+
import java.util.List;
12+
13+
/**
14+
* Created by CaoDongping on 4/15/16.
15+
*/
16+
public class FresherArticlesParser implements ArticleParser {
17+
18+
@Override
19+
public List<Object> parse(String issue) throws IOException {
20+
Document doc = DocumentProvider.get(issue);
21+
List<Object> articles = new ArrayList<>();
22+
Elements tables = doc.getElementsByTag("table");
23+
String currentSection = null;
24+
for (Element e : tables) {
25+
Elements h2 = e.getElementsByTag("h2");
26+
if (!h2.isEmpty()) {
27+
currentSection = h2.get(0).text();
28+
articles.add(currentSection);
29+
} else {
30+
Elements tds = e.getElementsByTag("td");
31+
Element td = tds.get(tds.size() - 2);
32+
String imageUrl = null;
33+
if (tds.size() == 4) {
34+
imageUrl = tds.get(0).getElementsByTag("img").get(0).attr("src");
35+
}
36+
String title = td.getElementsByClass("article-headline").get(0).text();
37+
String brief = td.getElementsByTag("p").get(0).text();
38+
String link = td.getElementsByClass("article-headline").get(0).attr("href");
39+
String domain = td.getElementsByTag("span").get(0).text().replace("(", "").replace(")", "");
40+
if (issue == null) {
41+
String number = doc.getElementsByClass("issue-header").get(0).getElementsByTag("span").get(0).text();
42+
issue = "/issues/issue-" + number.replace("#", "");
43+
}
44+
Article article = new Article();
45+
article.setTitle(title);
46+
article.setBrief(brief);
47+
article.setLink(link);
48+
article.setDomain(domain);
49+
article.setIssue(issue);
50+
article.setImageUrl(imageUrl);
51+
article.setSection(currentSection);
52+
articles.add(article);
53+
}
54+
}
55+
return articles;
56+
}
57+
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
package com.github.mzule.androidweekly.api.parser;
2+
3+
import com.github.mzule.androidweekly.entity.Article;
4+
5+
import org.jsoup.nodes.Document;
6+
import org.jsoup.nodes.Element;
7+
import org.jsoup.select.Elements;
8+
9+
import java.io.IOException;
10+
import java.util.ArrayList;
11+
import java.util.List;
12+
13+
/**
14+
* Created by CaoDongping on 4/15/16.
15+
*/
16+
public class OlderArticlesParser implements ArticleParser {
17+
18+
@Override
19+
public List<Object> parse(String issue) throws IOException {
20+
Document doc = DocumentProvider.get(issue);
21+
List<Object> articles = new ArrayList<>();
22+
Element root = doc.getElementsByClass("issue").get(0);
23+
while (root.children().size() == 1) {
24+
root = root.child(0);
25+
}
26+
String currentSection = null;
27+
for (Element e : root.children()) {
28+
if (e.tagName().equals("h2")) {
29+
currentSection = e.text();
30+
articles.add(currentSection);
31+
continue;
32+
}
33+
if (e.tagName().equals("div")) {
34+
Elements img = e.getElementsByTag("img");
35+
if (!img.isEmpty()) {
36+
Article article = new Article();
37+
article.setImageUrl(img.get(0).attr("src"));
38+
article.setTitle(e.getElementsByTag("a").get(1).text());
39+
article.setLink(e.getElementsByTag("a").get(1).attr("href"));
40+
article.setBrief(e.getElementsByTag("p").get(0).text());
41+
Elements span = e.getElementsByTag("span");
42+
if (!span.isEmpty()) {
43+
article.setDomain(span.get(0).text().replace("(", "").replace(")", ""));
44+
}
45+
article.setIssue(issue);
46+
article.setSection(currentSection);
47+
articles.add(article);
48+
}
49+
} else {
50+
Article article = new Article();
51+
Elements title = e.getElementsByTag("a");
52+
if (title.isEmpty()) {
53+
continue;
54+
}
55+
article.setTitle(title.get(0).text());
56+
Elements span = e.getElementsByTag("span");
57+
if (!span.isEmpty()) {
58+
article.setDomain(span.get(0).text().replace("(", "").replace(")", ""));
59+
}
60+
article.setLink(e.getElementsByTag("a").get(0).attr("href"));
61+
article.setBrief(e.text());
62+
article.setIssue(issue);
63+
article.setSection(currentSection);
64+
articles.add(article);
65+
}
66+
}
67+
return articles;
68+
}
69+
}

0 commit comments

Comments
 (0)