Mirror of https://github.com/mirror/jdownloader.git
*Plugins: Fixes/Changes/Maintenance*
Twitter:
- TwitterComCrawler: Fixed stop logic when "maxitems" or "max_date" is given in an added profile URL, RE forum 93866
- TwitterConfigInterface: Updated GUI translation of FilenameSchemes to clarify which parts of those filename schemes are optional

git-svn-id: svn://svn.jdownloader.org/jdownloader/trunk@48000 ebf7c1c2-ba36-0410-9fe8-c592906822b4
Former-commit-id: 8a2b4af64135b30e06bd9205f5c1e345a1db3945
Commit 84d2687244 (parent addae2c37f)
@@ -165,7 +165,10 @@ public class TwitterComCrawler extends PluginForDecrypt {
             query = UrlQuery.parse(param.getCryptedUrl());
         }
         try {
-            maxTweetsToCrawl = Integer.parseInt(query.get("maxitems"));
+            final int maxItemsToCrawlTmp = Integer.parseInt(query.get("maxitems"));
+            if (maxItemsToCrawlTmp > 0) {
+                maxTweetsToCrawl = maxItemsToCrawlTmp;
+            }
         } catch (final Throwable ignore) {
         }
         final String maxTweetDateStrTmp = query.get("max_date");
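The temporary-variable dance above is the actual fix: parse into a local first and only accept positive values, so a zero or negative "maxitems" in the URL can no longer silently overwrite the limit. A minimal sketch of that guard as a standalone helper (the helper name is illustrative, not part of the plugin):

```java
// Guarded parse: only a positive integer overrides the fallback;
// null, non-numeric, zero and negative values are all ignored.
private static Integer parsePositiveInt(final String value, final Integer fallback) {
    try {
        final int parsed = Integer.parseInt(value); // throws NumberFormatException on null/garbage
        if (parsed > 0) {
            return parsed;
        }
    } catch (final NumberFormatException ignore) {
    }
    return fallback;
}
```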
@@ -179,14 +182,17 @@ public class TwitterComCrawler extends PluginForDecrypt {
             }
         }
         try {
-            preGivenPageNumber = Integer.parseInt(query.get("page"));
+            final int preGivenPageNumberTmp = Integer.parseInt(query.get("page"));
+            if (preGivenPageNumberTmp > 0) {
+                preGivenPageNumber = preGivenPageNumberTmp;
+            }
             preGivenNumberOfTotalWalkedThroughTweetsCount = Integer.parseInt(query.get("totalCrawledTweetsCount"));
             preGivenNextCursor = Encoding.htmlDecode(query.get("nextCursor"));
             logger.info("Resuming from last state: page = " + preGivenPageNumber + " | totalCrawledTweetsCount = " + preGivenNumberOfTotalWalkedThroughTweetsCount + " | nextCursor = " + preGivenNextCursor);
         } catch (final Throwable ignore) {
         }
         br.setAllowedResponseCodes(new int[] { 429 });
-        final String newURL = param.getCryptedUrl().replaceFirst("https?://(www\\.|mobile\\.)?twitter\\.com/", "https://" + this.getHost() + "/");
+        final String newURL = param.getCryptedUrl().replaceFirst("(?i)https?://(www\\.|mobile\\.)?twitter\\.com/", "https://" + this.getHost() + "/");
         if (!newURL.equals(param.getCryptedUrl())) {
             logger.info("Corrected URL: Old: " + param.getCryptedUrl() + " | New: " + newURL);
             param.setCryptedUrl(newURL);
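Adding the `(?i)` inline flag makes the twitter.com-to-getHost() rewrite case-insensitive, so mixed-case links are normalized too. A quick illustration (the replacement host is a stand-in for `this.getHost()`):

```java
final String url = "HTTPS://Twitter.com/someuser/status/1";
// Without the (?i) flag the upper-case scheme/host would not match
// and the URL would pass through unchanged.
final String fixed = url.replaceFirst("(?i)https?://(www\\.|mobile\\.)?twitter\\.com/", "https://example-host/");
System.out.println(fixed); // https://example-host/someuser/status/1
```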
@@ -364,7 +370,9 @@ public class TwitterComCrawler extends PluginForDecrypt {
                     if (tweet == null) {
                         continue;
                     }
-                    ret.addAll(crawlTweetMap(tweet, usr, fp));
+                    final ArrayList<DownloadLink> thisResults = crawlTweetMap(tweet, usr, fp);
+                    ret.addAll(thisResults);
+                    distribute(thisResults);
                 } else if (typename.equalsIgnoreCase("TweetTombstone")) {
                     /* TODO: Check if this handling is working */
                     /* 18+ content. We can find the ID of that tweet but we can't know the name of the user who posted it. */
@@ -373,9 +381,9 @@ public class TwitterComCrawler extends PluginForDecrypt {
                     if (thisTweetID == null) {
                         throw new PluginException(LinkStatus.ERROR_PLUGIN_DEFECT);
                     } else if (thisTweetID.equals(tweetID)) {
-                        /*
-                         * The tweet which we are crawling at this moment -> Account required to view that content --> Mostly this
-                         * happens with mature content.
+                        /**
+                         * ID of the Tweet which we are crawling at this moment -> Account required to view that content </br>
+                         * --> Mostly this happens with mature content.
                          */
                         throw new AccountRequiredException();
                     }
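The new `distribute(thisResults)` call publishes each tweet's links to the link crawler as soon as they are found, instead of only when the whole crawl returns, so results appear in the linkgrabber while a long profile is still being walked. The pattern, in short:

```java
// Before: results were only added to the return list and became visible
// once the whole crawl finished:
// ret.addAll(crawlTweetMap(tweet, usr, fp));
// After: keep the batch for the return value and also publish it immediately.
final ArrayList<DownloadLink> thisResults = crawlTweetMap(tweet, usr, fp);
ret.addAll(thisResults);
distribute(thisResults); // incremental hand-off: links show up right away
```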
@@ -596,7 +604,6 @@ public class TwitterComCrawler extends PluginForDecrypt {
         if (mediaLists.size() > 0) {
             final List<String> mediaTypesVideo = Arrays.asList(new String[] { "animated_gif", "video" });
             final String mediaTypePhoto = "photo";
-            final Set<String> foundMediaTypes = new HashSet<String>();
             final Map<String, DownloadLink> mediaResultMap = new LinkedHashMap<String, DownloadLink>();
             final Set<String> videoIDs = new HashSet<String>();
             for (final List<Map<String, Object>> mediaList : mediaLists) {
@@ -607,7 +614,6 @@ public class TwitterComCrawler extends PluginForDecrypt {
                 if (mediaResultMap.containsKey(keyForMap)) {
                     continue;
                 }
-                foundMediaTypes.add(mediaType);
                 final DownloadLink dl;
                 if (mediaTypesVideo.contains(mediaType)) {
                     videoIDs.add(mediaIDStr);
@@ -682,7 +688,6 @@ public class TwitterComCrawler extends PluginForDecrypt {
                 }
                 logger.info("Skipped thumbnails: " + numberofSkippedVideoThumbnails);
             }
-            logger.info("Found media types: " + foundMediaTypes);
             /* Add results to list to be returned later. */
             retMedia.addAll(mediaResultMap.values());
         }
@@ -764,6 +769,7 @@ public class TwitterComCrawler extends PluginForDecrypt {
             text.setProperty(PROPERTY_MEDIA_INDEX, 0);
             text.setProperty(PROPERTY_TYPE, TYPE_TEXT);
             text.setAvailable(true);
+            // text.setEnabled(false);
             retInternal.add(text);
         } else {
             itemsSkippedDueToPluginSettings++;
@@ -793,7 +799,6 @@ public class TwitterComCrawler extends PluginForDecrypt {
         retAll.addAll(retInternal);
         retAll.addAll(retExternal);
         fp.addLinks(retAll);
-        this.distribute(retAll);
         /* Logger just in case nothing was added. */
         if (retMedia.isEmpty()) {
             if (itemsSkippedDueToPluginSettings == 0) {
@@ -1057,48 +1062,42 @@ public class TwitterComCrawler extends PluginForDecrypt {
         final List<Object> pagination_info = (List<Object>) JavaScriptEngineFactory.walkJson(root, "timeline/instructions/{0}/addEntries/entries");
         final Map<String, Object> tweetMap = (Map<String, Object>) globalObjects.get("tweets");
         final Iterator<Entry<String, Object>> iterator = tweetMap.entrySet().iterator();
         String lastCreatedAtDateStr = null;
-        boolean stopBecauseReachedUserDefinedMaxItemsLimit = false;
-        boolean stopBecauseReachedUserDefinedMaxDate = false;
         Long lastCrawledTweetTimestamp = null;
-        tweetItemsLoop: while (iterator.hasNext()) {
+        final List<DownloadLink> allowedResults = new ArrayList<DownloadLink>();
+        final HashSet<DownloadLink> skippedResultsByMaxItems = new HashSet<DownloadLink>();
+        final HashSet<DownloadLink> skippedResultsByMaxDate = new HashSet<DownloadLink>();
+        while (iterator.hasNext()) {
             final Map<String, Object> tweet = (Map<String, Object>) iterator.next().getValue();
             final Map<String, Object> userWhoPostedThisTweet = (Map<String, Object>) users.get(tweet.get("user_id_str").toString());
-            final List<DownloadLink> results = crawlTweetMap(tweet, userWhoPostedThisTweet, fp);
-            /* Count Tweet as crawled either way. If our array of results is empty this is due to the users' settings. */
-            totalCrawledTweetsCount++;
-            if (results.size() > 0) {
-                ret.addAll(results);
-                for (final DownloadLink thisTweetResult : results) {
-                    /* Find timestamp of last added result. Ignore pinned tweets. */
-                    final String tweetID = thisTweetResult.getStringProperty(PROPERTY_TWEET_ID);
-                    if (pinned_tweet_ids_str == null || (tweetID != null && !pinned_tweet_ids_str.contains(tweetID))) {
-                        lastCrawledTweetTimestamp = thisTweetResult.getLongProperty(PROPERTY_DATE_TIMESTAMP, -1);
-                    }
-                }
-            } else {
-                /* E.g. tweet only consists of text and used has disabled crawling tweet texts. */
+            final List<DownloadLink> thisResults = crawlTweetMap(tweet, userWhoPostedThisTweet, fp);
+            if (thisResults.isEmpty()) {
+                logger.info("Found nothing for tweet: " + tweet);
+                continue;
             }
             lastCreatedAtDateStr = (String) tweet.get("created_at");
-            /*
-             * Set stop conditions here but don't break out of the main loop yet. The reason for this is that we want to have the
-             * important log statement that comes next to this loop!
-             */
-            if (this.maxTweetsToCrawl != null && totalCrawledTweetsCount >= this.maxTweetsToCrawl.intValue()) {
-                stopBecauseReachedUserDefinedMaxItemsLimit = true;
-                break tweetItemsLoop;
+            for (final DownloadLink thisTweetResult : thisResults) {
+                /* Find timestamp of last added result. Ignore pinned tweets. */
+                final String tweetID = thisTweetResult.getStringProperty(PROPERTY_TWEET_ID);
+                if (pinned_tweet_ids_str == null || (tweetID != null && !pinned_tweet_ids_str.contains(tweetID))) {
+                    lastCrawledTweetTimestamp = thisTweetResult.getLongProperty(PROPERTY_DATE_TIMESTAMP, -1);
+                }
+            }
+            if (this.maxTweetsToCrawl != null && totalCrawledTweetsCount == this.maxTweetsToCrawl.intValue()) {
+                skippedResultsByMaxItems.addAll(thisResults);
             } else if (this.crawlUntilTimestamp != null && lastCrawledTweetTimestamp != null && lastCrawledTweetTimestamp < crawlUntilTimestamp) {
-                stopBecauseReachedUserDefinedMaxDate = true;
-                break tweetItemsLoop;
+                skippedResultsByMaxDate.addAll(thisResults);
+            } else {
+                totalCrawledTweetsCount++;
+                allowedResults.addAll(thisResults);
             }
         }
-        logger.info("Crawled page " + page + " | Tweets crawled so far: " + totalCrawledTweetsCount + "/" + maxCount.intValue() + " | lastCreatedAtDateStr = " + lastCreatedAtDateStr + " | last nextCursor = " + nextCursor);
+        logger.info("Crawled page " + page + " | Tweets crawled so far: " + totalCrawledTweetsCount + "/" + maxCount.intValue() + " | last nextCursor = " + nextCursor);
+        distribute(allowedResults);
+        ret.addAll(allowedResults);
         /* Check abort conditions */
-        if (stopBecauseReachedUserDefinedMaxItemsLimit) {
-            logger.info("Stopping because: Reached user defined max items count: " + maxTweetsToCrawl + " | Actually crawled: " + totalCrawledTweetsCount);
+        if (skippedResultsByMaxItems.size() > 0) {
+            logger.info("Stopping because: Reached user defined max items count: " + maxTweetsToCrawl);
             break tweetTimeline;
-        } else if (stopBecauseReachedUserDefinedMaxDate) {
+        } else if (skippedResultsByMaxDate.size() > 0) {
            logger.info("Stopping because: Last item age is older than user defined max age " + this.maxTweetDateStr);
            break tweetTimeline;
        } else if (tweetMap.isEmpty()) {
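This hunk is the heart of the stop-logic fix named in the commit message. The old loop counted every tweet toward "maxitems", even tweets whose results were filtered out by plugin settings, and bailed out via a labeled break; the rework counts only tweets that actually produced results and parks over-limit or too-old batches in skip sets, which double as the stop signals checked after the loop. Because skipped batches never enter `allowedResults`, they are neither returned nor distributed. A self-contained sketch of the pattern (simplified to plain timestamps and made-up limits; the real code handles `DownloadLink` batches):

```java
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Sketch of the reworked stop logic: items past the user limit or older than
// the cut-off date are parked in skip sets instead of aborting mid-loop; the
// caller stops paginating when a skip set is non-empty, so per-page logging
// still happens exactly once.
public class StopLogicSketch {
    public static void main(String[] args) {
        final Integer maxItemsToCrawl = 3;      // would come from the "maxitems" URL parameter
        final Long crawlUntilTimestamp = 1000L; // would come from the "max_date" URL parameter
        final long[] tweetTimestamps = { 5000, 4000, 3000, 2000, 500 };
        final List<Long> allowed = new ArrayList<Long>();
        final Set<Long> skippedByMaxItems = new HashSet<Long>();
        final Set<Long> skippedByMaxDate = new HashSet<Long>();
        int crawledCount = 0;
        for (final long timestamp : tweetTimestamps) {
            if (maxItemsToCrawl != null && crawledCount == maxItemsToCrawl.intValue()) {
                skippedByMaxItems.add(timestamp); // limit reached -> park, don't break
            } else if (crawlUntilTimestamp != null && timestamp < crawlUntilTimestamp) {
                skippedByMaxDate.add(timestamp);  // older than cut-off -> park
            } else {
                crawledCount++;                   // only accepted items count
                allowed.add(timestamp);
            }
        }
        // Non-empty skip sets signal "stop pagination" to the outer loop.
        System.out.println("allowed=" + allowed + " | stopByMaxItems=" + (!skippedByMaxItems.isEmpty()) + " | stopByMaxDate=" + (!skippedByMaxDate.isEmpty()));
    }
}
```

Note the `==` comparison works because `crawledCount` stops advancing once the limit is hit, so every later batch falls into the skip set, equivalent to the old `>=` check but without double-counting filtered tweets.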
@@ -1215,7 +1214,9 @@ public class TwitterComCrawler extends PluginForDecrypt {
         final List<Map<String, Object>> timelineInstructions = (List<Map<String, Object>>) JavaScriptEngineFactory.walkJson(entries, "data/user/result/timeline_v2/timeline/instructions");
         final int totalCrawledTweetsCountOld = totalCrawledTweetsCount;
         boolean stopBecauseReachedUserDefinedMaxItemsLimit = false;
-        boolean stopBecauseReachedUserDefinedMaxDate = false;
+        final List<DownloadLink> allowedResults = new ArrayList<DownloadLink>();
+        final HashSet<DownloadLink> skippedResultsByMaxDate = new HashSet<DownloadLink>();
+        int foundTweetsOnCurrentPage = 0;
         timelineInstructionsLoop: for (final Map<String, Object> timelineInstruction : timelineInstructions) {
             if (!timelineInstruction.get("type").toString().equalsIgnoreCase("TimelineAddEntries")) {
                 continue;
@@ -1224,7 +1225,6 @@ public class TwitterComCrawler extends PluginForDecrypt {
             for (final Map<String, Object> timelineEntry : timelineEntries) {
                 final Map<String, Object> content = (Map<String, Object>) timelineEntry.get("content");
                 final String contentType = (String) content.get("entryType");
-                Long lastCrawledTweetTimestamp = null;
                 if (contentType.equalsIgnoreCase("TimelineTimelineCursor")) {
                     if (content.get("cursorType").toString().equalsIgnoreCase("Bottom")) {
                         nextCursor = content.get("value").toString();
@@ -1238,22 +1238,33 @@ public class TwitterComCrawler extends PluginForDecrypt {
                 }
                 final String typename = (String) result.get("__typename");
                 if (typename.equalsIgnoreCase("Tweet")) {
+                    foundTweetsOnCurrentPage++;
                     final Map<String, Object> usr = (Map<String, Object>) JavaScriptEngineFactory.walkJson(result, "core/user_results/result/legacy");
                     final Map<String, Object> tweet = (Map<String, Object>) result.get("legacy");
                     if (tweet == null) {
                         continue;
                     }
                     final ArrayList<DownloadLink> thisTweetResults = crawlTweetMap(tweet, usr, fp);
+                    Long crawledTweetTimestamp = null;
                     for (final DownloadLink thisTweetResult : thisTweetResults) {
                         /* Find timestamp of last added result. Ignore pinned tweets. */
                         final String tweetID = thisTweetResult.getStringProperty(PROPERTY_TWEET_ID);
                         if (pinned_tweet_ids_str == null || (tweetID != null && !pinned_tweet_ids_str.contains(tweetID))) {
-                            lastCrawledTweetTimestamp = thisTweetResult.getLongProperty(PROPERTY_DATE_TIMESTAMP, -1);
+                            crawledTweetTimestamp = thisTweetResult.getLongProperty(PROPERTY_DATE_TIMESTAMP, -1);
+                            break;
                         }
                     }
-                    ret.addAll(thisTweetResults);
-                    totalCrawledTweetsCount++;
+                    if (this.crawlUntilTimestamp != null && crawledTweetTimestamp != null && crawledTweetTimestamp < crawlUntilTimestamp) {
+                        skippedResultsByMaxDate.addAll(thisTweetResults);
+                    } else if (this.maxTweetsToCrawl != null && totalCrawledTweetsCount == this.maxTweetsToCrawl.intValue()) {
+                        stopBecauseReachedUserDefinedMaxItemsLimit = true;
+                        break timelineInstructionsLoop;
+                    } else {
+                        allowedResults.addAll(thisTweetResults);
+                        totalCrawledTweetsCount++;
+                    }
                 } else if (typename.equalsIgnoreCase("TweetTombstone")) {
+                    foundTweetsOnCurrentPage++;
                     /* TODO: Check if this handling is working */
                     /* 18+ content. We can find the ID of that tweet but we can't know the name of the user who posted it. */
                     final String entryId = timelineEntry.get("entryId").toString();
@@ -1263,32 +1274,29 @@ public class TwitterComCrawler extends PluginForDecrypt {
                     }
                     final DownloadLink link = this.createDownloadlink("https://" + this.getHost() + "/unknowntwitteruser/status/" + thisTweetID);
                     link._setFilePackage(fp);
-                    ret.add(link);
+                    allowedResults.add(link);
                     totalCrawledTweetsCount++;
+                    if (this.maxTweetsToCrawl != null && totalCrawledTweetsCount == this.maxTweetsToCrawl.intValue()) {
+                        stopBecauseReachedUserDefinedMaxItemsLimit = true;
+                        break timelineInstructionsLoop;
+                    }
                 } else {
                     logger.info("Skipping unsupported __typename: " + typename);
                     continue;
                 }
             }
-            if (this.maxTweetsToCrawl != null && totalCrawledTweetsCount >= this.maxTweetsToCrawl.intValue()) {
-                stopBecauseReachedUserDefinedMaxItemsLimit = true;
-                break timelineInstructionsLoop;
-            } else if (this.crawlUntilTimestamp != null && lastCrawledTweetTimestamp != null && lastCrawledTweetTimestamp < crawlUntilTimestamp) {
-                stopBecauseReachedUserDefinedMaxDate = true;
-                break timelineInstructionsLoop;
-            }
         }
         }
-        final int crawledTweetsThisPage = totalCrawledTweetsCount - totalCrawledTweetsCountOld;
-        totalCrawledTweetsCount += crawledTweetsThisPage;
-        logger.info("Crawled page " + page + " | Found tweets on this page: " + crawledTweetsThisPage + " | Total tweets: " + totalCrawledTweetsCount + " | nextCursor = " + nextCursor);
+        ret.addAll(allowedResults);
+        distribute(allowedResults);
+        logger.info("Crawled page " + page + " | Found Tweets on this page: " + foundTweetsOnCurrentPage + " | Tweets crawled so far: " + totalCrawledTweetsCount + " | nextCursor = " + nextCursor);
         if (this.isAbort()) {
             logger.info("Stopping because: Aborted by user");
             break;
         } else if (StringUtils.isEmpty(nextCursor)) {
             logger.info("Stopping because: Failed to find nextCursor");
             break;
-        } else if (crawledTweetsThisPage == 0) {
+        } else if (foundTweetsOnCurrentPage == 0) {
             logger.info("Stopping because: Failed to find any items on current page " + page);
             break;
         } else if (!cursorDupes.add(nextCursor)) {
@@ -1297,7 +1305,7 @@ public class TwitterComCrawler extends PluginForDecrypt {
         } else if (stopBecauseReachedUserDefinedMaxItemsLimit) {
             logger.info("Stopping because: Reached user defined max items count: " + maxTweetsToCrawl + " | Actually crawled: " + totalCrawledTweetsCount);
             break;
-        } else if (stopBecauseReachedUserDefinedMaxDate) {
+        } else if (skippedResultsByMaxDate.size() > 0) {
             logger.info("Stopping because: Last item age is older than user defined max age " + this.maxTweetDateStr);
             break;
         } else {
@@ -390,7 +390,7 @@ public class DepositFiles extends antiDDoSForHost {
             br.followConnection(true);
             handleDownloadError(br, dl.getConnection(), link);
         }
-        handleDownload(dl.getConnection(), link, "finallink", downloadURL);
+        handlePreDownloadStuff(dl.getConnection(), link, "finallink", downloadURL);
         try {
             /* add a download slot */
             controlRunningDownloads(account, link, true);
@@ -500,7 +500,7 @@ public class DepositFiles extends antiDDoSForHost {
             br.followConnection(true);
             handleDownloadError(br, dl.getConnection(), link);
         }
-        handleDownload(dl.getConnection(), link, "finallink", finallink);
+        handlePreDownloadStuff(dl.getConnection(), link, "finallink", finallink);
         try {
             /* add a download slot */
             controlRunningDownloads(account, link, true);
@@ -512,7 +512,7 @@ public class DepositFiles extends antiDDoSForHost {
         }
     }

-    private void handleDownload(final URLConnectionAdapter connection, final DownloadLink link, String finalLinkProperty, String finalLink) {
+    private void handlePreDownloadStuff(final URLConnectionAdapter connection, final DownloadLink link, String finalLinkProperty, String finalLink) {
         final String name = Plugin.getFileNameFromHeader(connection);
         if (name != null && name.contains("?") && link.getFinalFileName() == null) {
             /* fix invalid filenames */
@@ -809,7 +809,7 @@ public class DepositFiles extends antiDDoSForHost {
             br.followConnection(true);
             handleDownloadError(br, dl.getConnection(), link);
         }
-        handleDownload(dl.getConnection(), link, null, url);
+        handlePreDownloadStuff(dl.getConnection(), link, null, url);
         /* add a download slot */
         controlRunningDownloads(account, link, true);
         try {
@@ -1068,10 +1068,10 @@ public class DepositFiles extends antiDDoSForHost {
             br.followConnection(true);
             handleDownloadError(br, dl.getConnection(), link);
         }
-        handleDownload(dl.getConnection(), link, "finallink", directurl);
+        handlePreDownloadStuff(dl.getConnection(), link, "finallink", directurl);
+        /* Add a download slot */
+        controlRunningDownloads(account, link, true);
         try {
-            /* add a download slot */
-            controlRunningDownloads(account, link, true);
             /* start the dl */
             dl.startDownload();
         } finally {
@@ -100,19 +100,19 @@ public interface TwitterConfigInterface extends PluginConfigInterface {
         ORIGINAL_PLUS {
             @Override
             public String getLabel() {
-                return "Original+: <date>_<tweet_id>_<originalFilenameWithoutExt>.<ext>";
+                return "Original+: <date>_<tweet_id>[opt:_<originalFilenameWithoutExt>].<ext>";
             }
         },
         ORIGINAL_PLUS_2 {
             @Override
             public String getLabel() {
-                return "Original+2: <date>_<username>_<tweet_id>_<originalFilenameWithoutExt>.<ext>";
+                return "Original+2: <date>_<username>_<tweet_id>[opt:_<originalFilenameWithoutExt>].<ext>";
             }
         },
         PLUGIN {
             @Override
             public String getLabel() {
-                return "Plugin: <date>_<username>_<tweet_id>_<reply>_<mediaIndex>.<ext>";
+                return "Plugin: <date>_<username>_<tweet_id>[opt:_<reply>][opt:_<mediaIndex>].<ext>";
             }
         };
     }
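The relabeled schemes use `[opt:...]` to mark segments that are only present when the underlying data exists; for example, `_<originalFilenameWithoutExt>` is omitted when Twitter does not deliver an original filename. A rough illustration of how such an optional segment behaves (the method is made up for the example, not plugin code):

```java
// Hypothetical illustration of the "[opt:...]" notation used in the labels above:
// an optional segment is simply dropped when its value is missing.
static String buildOriginalPlusName(final String date, final String tweetId, final String originalNameWithoutExt, final String ext) {
    final StringBuilder sb = new StringBuilder();
    sb.append(date).append('_').append(tweetId);
    if (originalNameWithoutExt != null) {
        sb.append('_').append(originalNameWithoutExt); // [opt:_<originalFilenameWithoutExt>]
    }
    return sb.append('.').append(ext).toString();
}
// buildOriginalPlusName("2023-07-17", "1234567890", "IMG_001", "jpg") -> "2023-07-17_1234567890_IMG_001.jpg"
// buildOriginalPlusName("2023-07-17", "1234567890", null, "jpg")      -> "2023-07-17_1234567890.jpg"
```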