1998-07-27 22:16:13 +00:00
|
|
|
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the Netscape Public License
|
|
|
|
* Version 1.0 (the "License"); you may not use this file except in
|
|
|
|
* compliance with the License. You may obtain a copy of the License at
|
|
|
|
* http://www.mozilla.org/NPL/
|
|
|
|
*
|
|
|
|
* Software distributed under the License is distributed on an "AS IS"
|
|
|
|
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
|
|
|
|
* the License for the specific language governing rights and limitations
|
|
|
|
* under the License.
|
|
|
|
*
|
|
|
|
* The Original Code is Mozilla Communicator client code.
|
|
|
|
*
|
|
|
|
* The Initial Developer of the Original Code is Netscape Communications
|
|
|
|
* Corporation. Portions created by Netscape are Copyright (C) 1998
|
|
|
|
* Netscape Communications Corporation. All Rights Reserved.
|
|
|
|
*/
|
|
|
|
#ifndef nsWebCrawler_h___
|
|
|
|
#define nsWebCrawler_h___
|
|
|
|
|
|
|
|
#include "nsIBrowserWindow.h"
|
1998-11-20 18:26:41 +00:00
|
|
|
#include "nsIStreamListener.h"
|
1998-07-27 22:16:13 +00:00
|
|
|
#include "nsVoidArray.h"
|
|
|
|
#include "nsString.h"
|
|
|
|
|
|
|
|
// Forward declarations. This header only refers to these types through
// pointers, so their full definitions are not required here.
class nsIAtom;
class nsIContent;
class nsIDocument;
class nsITimer;
class nsIURL;
class nsIPresShell;
class nsIWebShell;   // named by nsWebCrawler::EndLoadURL
class nsViewerApp;
class AtomHashTable;
|
|
|
|
|
1998-11-20 18:26:41 +00:00
|
|
|
class nsWebCrawler : public nsIStreamObserver {
|
1998-07-27 22:16:13 +00:00
|
|
|
public:
|
1999-02-08 17:57:00 +00:00
|
|
|
// Make a new web-crawler for the given viewer. Note: the web
|
|
|
|
// crawler does not addref the viewer.
|
1998-07-27 22:16:13 +00:00
|
|
|
nsWebCrawler(nsViewerApp* aViewer);
|
|
|
|
|
|
|
|
// nsISupports
|
|
|
|
NS_DECL_ISUPPORTS
|
|
|
|
|
|
|
|
// nsIStreamObserver
|
|
|
|
NS_IMETHOD OnStartBinding(nsIURL* aURL, const char *aContentType);
|
1998-12-16 05:40:20 +00:00
|
|
|
NS_IMETHOD OnProgress(nsIURL* aURL, PRUint32 aProgress, PRUint32 aProgressMax);
|
|
|
|
NS_IMETHOD OnStatus(nsIURL* aURL, const PRUnichar* aMsg);
|
|
|
|
NS_IMETHOD OnStopBinding(nsIURL* aURL, nsresult status, const PRUnichar* aMsg);
|
1998-07-27 22:16:13 +00:00
|
|
|
|
|
|
|
// Add a url to load
|
|
|
|
void AddURL(const nsString& aURL);
|
|
|
|
|
|
|
|
// Add a domain that is safe to load url's from
|
|
|
|
void AddSafeDomain(const nsString& aDomain);
|
|
|
|
|
|
|
|
// Add a domain that must be avoided
|
|
|
|
void AddAvoidDomain(const nsString& aDomain);
|
|
|
|
|
1998-09-15 18:06:19 +00:00
|
|
|
void SetBrowserWindow(nsIBrowserWindow* aWindow);
|
1999-02-19 04:24:12 +00:00
|
|
|
void GetBrowserWindow(nsIBrowserWindow** aWindow);
|
1998-07-27 22:16:13 +00:00
|
|
|
|
|
|
|
// Set the delay (by default, the timer is set to one second)
|
|
|
|
void SetDelay(PRInt32 aSeconds) {
|
|
|
|
mDelay = aSeconds;
|
|
|
|
}
|
|
|
|
|
|
|
|
void EnableJiggleLayout() {
|
|
|
|
mJiggleLayout = PR_TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If set to TRUE the loader will post an exit message on exit
|
|
|
|
void SetExitOnDone(PRBool aPostExit) {
|
|
|
|
mPostExit = aPostExit;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Start loading documents
|
|
|
|
void Start();
|
|
|
|
|
|
|
|
// Enable the crawler; when a document contains links to other
|
|
|
|
// documents the crawler will go to them subject to the limitations
|
|
|
|
// on the total crawl count and the domain name checks.
|
|
|
|
void EnableCrawler();
|
|
|
|
|
|
|
|
void SetRecordFile(FILE* aFile) {
|
|
|
|
mRecord = aFile;
|
|
|
|
}
|
|
|
|
|
1998-07-27 23:01:06 +00:00
|
|
|
void SetMaxPages(PRInt32 aMax) {
|
|
|
|
mMaxPages = aMax;
|
|
|
|
}
|
|
|
|
|
1998-09-02 22:07:42 +00:00
|
|
|
void SetOutputDir(const nsString& aOutputDir);
|
|
|
|
|
1998-12-10 18:05:59 +00:00
|
|
|
void SetRegressionDir(const nsString& aOutputDir);
|
|
|
|
|
|
|
|
void SetEnableRegression(PRBool aSetting) {
|
|
|
|
mRegressing = aSetting;
|
|
|
|
}
|
|
|
|
|
1998-07-27 22:16:13 +00:00
|
|
|
void LoadNextURL();
|
|
|
|
|
1998-11-19 17:24:13 +00:00
|
|
|
void SetVerbose(PRBool aSetting) {
|
|
|
|
mVerbose = aSetting;
|
|
|
|
}
|
|
|
|
|
1998-11-20 18:26:41 +00:00
|
|
|
void EndLoadURL(nsIWebShell* aShell, const PRUnichar* aURL, PRInt32 aStatus);
|
|
|
|
|
1998-07-27 22:16:13 +00:00
|
|
|
protected:
|
|
|
|
virtual ~nsWebCrawler();
|
|
|
|
|
|
|
|
void FindURLsIn(nsIDocument* aDocument, nsIContent* aNode);
|
|
|
|
|
|
|
|
void FindMoreURLs();
|
|
|
|
|
|
|
|
PRBool OkToLoad(const nsString& aURLSpec);
|
|
|
|
|
|
|
|
void RecordLoadedURL(const nsString& aURLSpec);
|
|
|
|
|
1998-09-02 22:07:42 +00:00
|
|
|
/** generate an output name from a URL */
|
1998-12-10 18:05:59 +00:00
|
|
|
FILE* GetOutputFile(nsIURL *aURL, nsString& aOutputName);
|
1998-09-02 22:07:42 +00:00
|
|
|
|
|
|
|
nsIPresShell* GetPresShell();
|
|
|
|
|
1998-12-10 18:05:59 +00:00
|
|
|
void PerformRegressionTest(const nsString& aOutputName);
|
|
|
|
|
1998-07-27 22:16:13 +00:00
|
|
|
nsIBrowserWindow* mBrowser;
|
|
|
|
nsViewerApp* mViewer;
|
|
|
|
nsITimer* mTimer;
|
|
|
|
FILE* mRecord;
|
|
|
|
nsIAtom* mLinkTag;
|
|
|
|
nsIAtom* mFrameTag;
|
|
|
|
nsIAtom* mIFrameTag;
|
1998-12-20 01:21:23 +00:00
|
|
|
nsIAtom* mHrefAttr;
|
|
|
|
nsIAtom* mSrcAttr;
|
|
|
|
nsIAtom* mBaseHrefAttr;
|
1998-07-27 22:16:13 +00:00
|
|
|
AtomHashTable* mVisited;
|
1998-12-10 18:05:59 +00:00
|
|
|
nsString mOutputDir;
|
1998-07-27 22:16:13 +00:00
|
|
|
|
|
|
|
PRBool mCrawl;
|
|
|
|
PRBool mJiggleLayout;
|
|
|
|
PRBool mPostExit;
|
|
|
|
PRInt32 mDelay;
|
1998-07-27 23:01:06 +00:00
|
|
|
PRInt32 mMaxPages;
|
1998-07-27 22:16:13 +00:00
|
|
|
|
1998-11-19 17:24:13 +00:00
|
|
|
nsString mCurrentURL;
|
|
|
|
PRBool mVerbose;
|
1998-12-10 18:05:59 +00:00
|
|
|
PRBool mRegressing;
|
|
|
|
nsString mRegressionDir;
|
1998-11-19 17:24:13 +00:00
|
|
|
|
1998-07-27 22:16:13 +00:00
|
|
|
nsVoidArray mPendingURLs;
|
|
|
|
nsVoidArray mSafeDomains;
|
|
|
|
nsVoidArray mAvoidDomains;
|
|
|
|
};
|
|
|
|
|
|
|
|
#endif /* nsWebCrawler_h___ */
|