Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code smells #1007

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
6 changes: 2 additions & 4 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,10 @@ public class Page {

private byte[] bytes;

private List<Request> targetRequests = new ArrayList<Request>();
private List<Request> targetRequests = new ArrayList<>();

private String charset;

public Page() {
}

public static Page fail(){
Page page = new Page();
Expand Down Expand Up @@ -105,9 +103,9 @@ public Json getJson() {

/**
* Sets the {@link Html} object held by this page.
*
* @param html html
* @deprecated since 0.4.0
* The html is parsed only on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead.
*/

public void setHtml(Html html) {
this.html = html;
}
Expand Down
17 changes: 10 additions & 7 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public class Spider implements Runnable, Task {

protected Downloader downloader;

protected List<Pipeline> pipelines = new ArrayList<Pipeline>();
protected List<Pipeline> pipelines = new ArrayList<>();

protected PageProcessor pageProcessor;

Expand All @@ -86,11 +86,11 @@ public class Spider implements Runnable, Task {

protected boolean exitWhenComplete = true;

protected final static int STAT_INIT = 0;
protected static final int STAT_INIT = 0;

protected final static int STAT_RUNNING = 1;
protected static final int STAT_RUNNING = 1;

protected final static int STAT_STOPPED = 2;
protected static final int STAT_STOPPED = 2;

protected boolean spawnUrl = true;

Expand Down Expand Up @@ -246,7 +246,7 @@ public Spider setPipelines(List<Pipeline> pipelines) {
* @return this
*/
public Spider clearPipeline() {
pipelines = new ArrayList<Pipeline>();
pipelines = new ArrayList<>();
return this;
}

Expand Down Expand Up @@ -313,7 +313,8 @@ public void run() {
// wait until new url added
waitNewUrl();
} else {
threadPool.execute(new Runnable() {
threadPool.execute(
new Runnable() {
@Override
public void run() {
try {
Expand Down Expand Up @@ -427,7 +428,6 @@ private void onDownloadSuccess(Request request, Page page) {
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
}
sleep(site.getSleepTime());
return;
}

private void onDownloaderFail(Request request) {
Expand Down Expand Up @@ -458,6 +458,8 @@ protected void sleep(int time) {
Thread.sleep(time);
} catch (InterruptedException e) {
logger.error("Thread interrupted when sleep",e);
//restore interrupted thread
Thread.currentThread().interrupt();
}
}

Expand Down Expand Up @@ -564,6 +566,7 @@ private void waitNewUrl() {
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
logger.warn("waitNewUrl - interrupted, error {}", e);
Thread.currentThread().interrupt();
} finally {
newUrlLock.unlock();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
*/
@Experimental
public class SmartContentSelector implements Selector {

/**
* Default no-argument constructor for SmartContentSelector.
*/
public SmartContentSelector() {
}

Expand All @@ -33,7 +35,7 @@ public String select(String html) {
int start;
int end;
StringBuilder text = new StringBuilder();
ArrayList<Integer> indexDistribution = new ArrayList<Integer>();
ArrayList<Integer> indexDistribution = new ArrayList<>();

lines = Arrays.asList(html.split("\n"));

Expand All @@ -47,39 +49,42 @@ public String select(String html) {
}

start = -1; end = -1;
boolean boolstart = false, boolend = false;
boolean boolstart = false;
boolean boolend = false;
text.setLength(0);

for (int i = 0; i < indexDistribution.size() - 1; i++) {
if (indexDistribution.get(i) > threshold && ! boolstart) {
if (indexDistribution.get(i+1).intValue() != 0

int i=0;
while (i < indexDistribution.size() - 1) {

if ((indexDistribution.get(i) > threshold && ! boolstart)
&& (indexDistribution.get(i+1).intValue() != 0
|| indexDistribution.get(i+2).intValue() != 0
|| indexDistribution.get(i+3).intValue() != 0) {
|| indexDistribution.get(i+3).intValue() != 0) ){
boolstart = true;
start = i;
continue;
i++;
}
}
if (boolstart) {
if (indexDistribution.get(i).intValue() == 0
|| indexDistribution.get(i+1).intValue() == 0) {

if ((boolstart) && (indexDistribution.get(i).intValue() == 0
|| indexDistribution.get(i+1).intValue() == 0) ){
end = i;
boolend = true;
}
}


StringBuilder tmp = new StringBuilder();
if (boolend) {
//System.out.println(start+1 + "\t\t" + end+1);
for (int ii = start; ii <= end; ii++) {
if (lines.get(ii).length() < 5) continue;
if (lines.get(ii).length() < 5) i++;
tmp.append(lines.get(ii) + "\n");
}
String str = tmp.toString();
//System.out.println(str);
if (str.contains("Copyright") ) continue;

if (str.contains("Copyright")) i++;
text.append(str);
boolstart = boolend = false;
}
i++;
}
return text.toString();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
public class FilePersistentBase {

protected String path;

/**
* Creates an instance with the default path "/data/webmagic/".
* The path is applied by calling {@code setPath}.
*/
public FilePersistentBase() {
setPath("/data/webmagic/");
}

/**
* Creates an instance with the given path.
* The path is applied by calling {@code setPath}.
*
* @param path the path passed on to {@code setPath}
*/
public FilePersistentBase(String path) {
setPath(path);
}

public static String PATH_SEPERATOR = "/";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ public class FilePageModelPipeline extends FilePersistentBase implements PageMod
* new FilePageModelPipeline with default path "/data/webmagic/"
*/
public FilePageModelPipeline() {
setPath("/data/webmagic/");
super();
}

public FilePageModelPipeline(String path) {
setPath(path);
super(path);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ public class JsonFilePageModelPipeline extends FilePersistentBase implements Pag
* new JsonFilePageModelPipeline with default path "/data/webmagic/"
*/
public JsonFilePageModelPipeline() {
setPath("/data/webmagic/");
super();
}

public JsonFilePageModelPipeline(String path) {
setPath(path);
super(path);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
*/
public abstract class IPUtils {

public static String getFirstNoLoopbackIPAddresses() throws SocketException {
public static String getFirstNoLoopbackIPAddresses() throws SocketException, NullPointerException{

Enumeration<NetworkInterface> networkInterfaces = NetworkInterface.getNetworkInterfaces();

Expand Down