Pay attention to some of the following concepts in the code below: creating an ExecutorService backed by a fixed thread pool, submitting Callable tasks that perform the scraping for each URL, collecting the corresponding Future objects, and retrieving each result with Future.get using a timeout before shutting the executor down.
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

/**
 * @param urlFile    Path of the file which contains the URLs to be scraped
 * @param outputFile File to which the scrape results will be written
 * @throws InterruptedException
 * @throws ExecutionException
 * @throws TimeoutException
 */
public void scrapeURLs(String urlFile, String outputFile)
        throws InterruptedException, ExecutionException, TimeoutException {
    Iterator<String> uiter = null;
    // Get the URLs from the file
    try {
        uiter = FileUtil.getURLIterator(urlFile);
    } catch (IOException e1) {
        e1.printStackTrace();
    }
    // Iterate through all URLs
    if (uiter != null) {
        //
        // Create an ExecutorService using a newFixedThreadPool
        //
        ExecutorService executorService = Executors.newFixedThreadPool(10);
        //
        // Create a map of Futures and URLs
        //
        Map<Future<String>, String> tasks = new LinkedHashMap<>();
        // Iterate through all URLs and submit them for scraping
        while (uiter.hasNext()) {
            String urlstr = uiter.next();
            //
            // Create a Callable instance which calls the function that invokes the
            // scraping for each URL and returns the content (full or part, based on some rules)
            //
            Callable<String> callable = new Callable<String>() {
                public String call() throws Exception {
                    return scrapeIndividualURL(urlstr);
                }
            };
            //
            // Submit the task to the executorService; at this point the scraping starts
            //
            Future<String> future = executorService.submit(callable);
            tasks.put(future, urlstr);
        }
        //
        // For each task, get the content and write it to the output file
        //
        tasks.forEach((future, url) -> {
            try {
                String content = future.get(120, TimeUnit.SECONDS);
                writeToFile(url, content, outputFile);
            } catch (InterruptedException | ExecutionException | TimeoutException | IOException e) {
                e.printStackTrace();
                try {
                    writeToFile(url, "Not Found", outputFile);
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                }
            }
        });
        executorService.shutdown();
    }
}

/**
 * Scrape the URL
 * @param urlstr
 * @return
 */
public static String scrapeIndividualURL(String urlstr) {
    URL url = null;
    StringBuilder contentb = new StringBuilder();
    try {
        // Get the URL content
        url = new URL(urlstr);
        // Create a URL connection object
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // Set the configuration parameters.
        // Note the read timeout set to 30 seconds; this is quite important
        // when you are planning to scrape URLs.
        conn.setConnectTimeout(100000);
        conn.setReadTimeout(30000);
        conn.connect();
        // Open the stream (error stream for HTTP errors) and put it into a BufferedReader
        InputStream in;
        if (conn.getResponseCode() >= 400) {
            in = conn.getErrorStream();
        } else {
            in = conn.getInputStream();
        }
        BufferedReader br = new BufferedReader(new InputStreamReader(in));
        String inputLine;
        while ((inputLine = br.readLine()) != null) {
            contentb.append(inputLine);
            contentb.append("\n");
        }
        br.close();
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return contentb.toString();
}

/**
 * Write to the file
 * @param url
 * @param value
 * @param outputFile
 */
private void writeToFile(String url, String value, String outputFile) throws IOException {
    FileWriter fw = new FileWriter(new File(outputFile), true);
    BufferedWriter bw = new BufferedWriter(fw);
    if (value != null) {
        bw.write(url + "\t" + value + "\n");
    } else {
        bw.write(url + "\t" + "Not Found" + "\n");
    }
    bw.close();
}
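The code above relies on a FileUtil.getURLIterator helper that reads the URL file and returns an Iterator<String>, one URL per entry; its implementation is not shown in the snippet. Below is a minimal sketch of what such a helper could look like, assuming the input is a plain text file with one URL per line; the class and method names simply mirror the call made in scrapeURLs.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;

public class FileUtil {

    /**
     * Read the given file and return an iterator over its non-empty lines,
     * each of which is expected to be a URL.
     * (Illustrative sketch; the original FileUtil implementation is not shown in this post.)
     */
    public static Iterator<String> getURLIterator(String urlFile) throws IOException {
        List<String> urls = Files.readAllLines(Paths.get(urlFile))
                .stream()
                .map(String::trim)
                .filter(line -> !line.isEmpty())
                .collect(Collectors.toList());
        return urls.iterator();
    }
}

With a helper along these lines in place, a call such as scrapeURLs("urls.txt", "output.tsv") (the file names here are just examples) reads the URLs, scrapes each one on the fixed thread pool, and appends one tab-separated line per URL to the output file.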