/** * Method to parse HTML source using JTidy. * Pls download JTidy jar file from its website. * @param is - Input Stream of the HTML Source */ private static void parseHtmlByTidy(InputStream is) { Tidy tidy = new Tidy(); // tidy will print whatever it parses to System.out Document doc = tidy.parseDOM(is, System.out); // Try to read
tags NodeList nodeList = doc.getElementsByTagName("form"); // Iterate thro all form tags. if (nodeList != null) { int iCount = nodeList.getLength(); for (int i = 0; i < iCount; i++) { Node node = nodeList.item(i); System.out.println( "node.getLocalName()=" + node.getLocalName()); System.out.println("node.getNodeName()=" + node.getNodeName()); System.out.println( "node.getNodeValue()=" + node.getNodeValue()); NamedNodeMap map = node.getAttributes(); System.out.println( "map.getNamedItem(name)=" + map.getNamedItem("name").getNodeName()); } } } private static void parseHtmlBySwing(InputStream is) throws IOException, BadLocationException { EditorKit ekit = new HTMLEditorKit(); javax.swing.text.Document doc = ekit.createDefaultDocument(); doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE); ekit.read(is, doc, 0); javax.swing.text.ElementIterator it = new javax.swing.text.ElementIterator(doc); javax.swing.text.Element elem; int count = 0; while ((elem = it.next()) != null) { javax.swing.text.AttributeSet s = (javax.swing.text.AttributeSet) elem .getAttributes() .getAttribute( HTML.Tag.INPUT); System.out.println(s + "count=" + count + elem.getName()); if (s != null) { count++; System.out.println( "Link No " + count + "\t" + s.getAttribute(HTML.Attribute.NAME) + "\t" + s.getAttribute(HTML.Attribute.VALUE)); } } // end of while } private static Map parseHtmlByTagSoup(InputStream is) throws SAXException, IOException, ParserConfigurationException { System.setProperty( "javax.xml.parsers.SAXParserFactory", "org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl"); SAXParserFactory spf = SAXParserFactory.newInstance(); System.out.println("Ok, SAX factory JAXP creates is: " + spf); System.out.println("Let's parse..."); SimpleDefaultHandler sdh = new SimpleDefaultHandler(); spf.newSAXParser().parse(is, sdh); return sdh.getMap(); }