Pay attention to some of the following concepts in the code below: creating an ExecutorService using Executors.newFixedThreadPool, wrapping the scraping of each URL in a Callable, submitting those Callable tasks to the executor and collecting the resulting Future objects, retrieving each result with future.get and a timeout, and configuring connect and read timeouts on the HttpURLConnection.
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

/**
 * @param urlFile    Path of the file which contains the URLs to be scraped
 * @param outputFile File where the scrape results will be written
 * @throws InterruptedException
 * @throws ExecutionException
 * @throws TimeoutException
 */
public void scrapeURLs(String urlFile, String outputFile)
        throws InterruptedException, ExecutionException, TimeoutException {
    Iterator<String> uiter = null;
    // Get the URLs from the file.
    // FileUtil.getURLIterator is a helper (defined elsewhere) assumed to return
    // an Iterator<String> over the URLs listed in the file, one per line.
    try {
        uiter = FileUtil.getURLIterator(urlFile);
    } catch (IOException e1) {
        e1.printStackTrace();
    }
    if (uiter != null) {
        //
        // Create an ExecutorService using a fixed thread pool of 10 threads
        //
        ExecutorService executorService = Executors.newFixedThreadPool(10);
        //
        // Create a map of Futures to their URLs; LinkedHashMap preserves submission order
        //
        Map<Future<String>, String> tasks = new LinkedHashMap<>();
        // Iterate through all URLs and submit a scraping task for each one
        while (uiter.hasNext()) {
            String urlstr = uiter.next();
            //
            // Create a Callable which invokes the scraping for this URL
            // and returns the content (full or partial, based on some rules)
            //
            Callable<String> callable = new Callable<String>() {
                public String call() throws Exception {
                    return scrapeIndividualURL(urlstr);
                }
            };
            //
            // Submit the task to the executorService; the scraping starts at this point
            //
            Future<String> future = executorService.submit(callable);
            tasks.put(future, urlstr);
        }
        //
        // For each task, get the content and write it to the output file
        //
        tasks.forEach((future, url) -> {
            String content;
            try {
                content = future.get(120, TimeUnit.SECONDS);
            } catch (InterruptedException | ExecutionException | TimeoutException e) {
                e.printStackTrace();
                content = "Not Found";
            }
            try {
                writeToFile(url, content, outputFile);
            } catch (IOException e) {
                e.printStackTrace();
            }
        });
        executorService.shutdown();
    }
}

/**
 * Scrape an individual URL
 * @param urlstr URL to be scraped
 * @return the page content as a String (empty if the URL could not be read)
 */
public static String scrapeIndividualURL(String urlstr) {
    StringBuilder contentb = new StringBuilder();
    try {
        URL url = new URL(urlstr);
        // Create a URL connection object
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // Set the configuration parameters.
        // Note the readTimeout set to 30 seconds; this is quite important
        // when you are planning to scrape URLs.
        conn.setConnectTimeout(100000);
        conn.setReadTimeout(30000);
        conn.connect();
        // Pick the error stream for HTTP error codes, the regular input stream otherwise
        InputStream in = (conn.getResponseCode() >= 400)
                ? conn.getErrorStream()
                : conn.getInputStream();
        if (in != null) {
            // Read the stream line by line into the content buffer
            BufferedReader br = new BufferedReader(new InputStreamReader(in));
            String inputLine;
            while ((inputLine = br.readLine()) != null) {
                contentb.append(inputLine);
                contentb.append("\n");
            }
            br.close();
        }
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return contentb.toString();
}

/**
 * Append one result line ("url<TAB>content") to the output file
 * @param url        URL that was scraped
 * @param value      scraped content
 * @param outputFile file to append to
 */
private void writeToFile(String url, String value, String outputFile) throws IOException {
    FileWriter fw = new FileWriter(new File(outputFile), true);
    BufferedWriter bw = new BufferedWriter(fw);
    if (value != null) {
        bw.write(url + "\t" + value + "\n");
    } else {
        bw.write(url + "\t" + "Not Found" + "\n");
    }
    bw.close();
}
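For completeness, here is a minimal sketch of how the methods above might be invoked. The class names URLScraper and URLScraperDemo and the file names urls.txt and scraped.tsv are hypothetical placeholders introduced for this example; they are not part of the code above.

import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException;

// Hypothetical driver: assumes the scraping methods above live in a class named URLScraper
public class URLScraperDemo {
    public static void main(String[] args) {
        URLScraper scraper = new URLScraper();
        try {
            // urls.txt: one URL per line; scraped.tsv: tab-separated "url<TAB>content" lines
            scraper.scrapeURLs("urls.txt", "scraped.tsv");
        } catch (InterruptedException | ExecutionException | TimeoutException e) {
            e.printStackTrace();
        }
    }
}

Because future.get is called with a 120-second timeout, a URL that hangs cannot block the remaining results indefinitely; its row is simply written as "Not Found" in the output file.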