package com.test.parser;

import java.net.URL;
import java.util.Iterator;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class testJsoup {

    public static void main(String[] args) {
        // online parser
        // http://try.jsoup.org/~LGB7rk_atM2roavV0d-czMt3J_g
        try {
            Document doc = Jsoup.connect("http://en.wikipedia.org/").get();
            Elements newsHeadlines = doc.select("#mp-itn b a");

            // Get first table
            Element table = doc.select("table").first();

            // Get td iterator
            Iterator<Element> ite = table.select("td").iterator();

            // Print content
            int cnt = 0;
            while (ite.hasNext()) {
                cnt++;
                System.out.println("Value " + cnt + ": " + ite.next().text());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
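If you pull jsoup in with Gradle (as in the other posts on this blog), the dependency line would look roughly like the sketch below; the version is an assumption, use whichever release you need:

dependencies {
    // jsoup HTML parser (version assumed)
    compile group: 'org.jsoup', name: 'jsoup', version: '1.8.2'
}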
Thursday, May 28, 2015
[java] HTML parser using the jsoup lib
Complete project: https://github.com/cihm/JavaHtmlParser
Tuesday, May 26, 2015
[linux] in a crontab command, % means a line break
Today I added a new Java argument to a cron job and then noticed the job was no longer running. After searching online I found it was caused by the % character: in a crontab command field % means a newline, so the arguments after it never reached my jar. Quoting the spec:

A <percent-sign> character in this field shall be translated to a <newline>. Any character preceded by a <backslash> (including the '%') shall cause that character to be treated literally. Only the first line (up to a '%' or end-of-line) of the command field shall be executed by the command interpreter. The other lines shall be made available to the command as standard input.
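For example, escaping the % with a backslash keeps it literal so the whole argument reaches the jar (a hypothetical crontab entry; the jar path and schedule are only for illustration):

# run every day at 02:00; \% keeps the percent signs literal instead of starting a new line
0 2 * * * java -jar /opt/app/myjob.jar $(date +\%Y\%m\%d)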
Sunday, May 24, 2015
[gradle] build a Gradle project without installing Gradle on Linux
Add the following to your build.gradle:

task wrapper(type: Wrapper) {
    gradleVersion = '2.3'
}
Run: gradle wrapper
Then, in the project directory on your Linux machine, run ./gradlew and ./gradlew build. If you want to install Gradle instead, see:
http://exponential.io/blog/2015/03/30/install-gradle-on-ubuntu-linux/
[gradle] build jar
Add the following to your build.gradle:

jar {
    baseName = 'solrGradleTest'
    version = '0.1.0'
}

By default the resulting jar is written under the project's build directory, e.g.:
C:\Users\1409035\Documents\GitHub\SolrJava\build\libs\solrGradleTest-0.1.0.jar
Thursday, May 21, 2015
[gson] read / write large files using the gson stream API
In order to read and write "big" JSON files without loading everything into memory at once, use the gson streaming API (JsonReader / JsonWriter).
read
File jsonInputFile = new File(path);
FileInputStream fisInput;
JsonReader reader;
try {
    fisInput = new FileInputStream(jsonInputFile);
    reader = new JsonReader(new InputStreamReader(fisInput, "UTF-8"));
    ArrayList<Map> categoryALHM = gson.fromJson(reader, ArrayList.class);
    if (null == categoryALHM || categoryALHM.size() == 0) {
        log.info(path + ":is null or empty");
        return false;
    }
    reader.close();
    fisInput.close();
} catch (Exception e) {
    log.info(e.getMessage().toString());
    return false;
}
write
File jsonFile = new File(originalDatpath);
FileOutputStream fos;
JsonWriter writer;
try {
    fos = new FileOutputStream(jsonFile);
    writer = new JsonWriter(new OutputStreamWriter(fos, "UTF-8"));
    writer.setIndent(" ");
    Map getCategoryMap = JsonTool.Json2Map(IbobarDataUtil.viewCategory());
    ArrayList<HashMap<String, Object>> getCategoryMapList = (ArrayList) getCategoryMap.get("list");
    log.info("original category:" + getCategoryMapList);
    gson.toJson(getCategoryMapList, ArrayList.class, writer);
    writer.close();
    fos.flush();
    fos.close();
} catch (Exception e) {
    log.info(e.getMessage().toString());
}
read and write
File jsonInputFile = new File(organizeDataPath);
File jsonOutputFile = new File(usingDatPath);
FileOutputStream fosOutput;
FileInputStream fisInput;
JsonWriter writer;
JsonReader reader;
try {
    fosOutput = new FileOutputStream(jsonOutputFile);
    writer = new JsonWriter(new OutputStreamWriter(fosOutput, "UTF-8"));
    writer.setIndent(" ");
    fisInput = new FileInputStream(jsonInputFile);
    reader = new JsonReader(new InputStreamReader(fisInput, "UTF-8"));

    writer.beginObject();
    reader.beginObject();
    // copy the input object key by key: read each value as a list,
    // then stream it back out under the same key
    while (reader.hasNext()) {
        String key = reader.nextName();
        List<Map> bookOranizeList = gson.fromJson(reader, ArrayList.class);
        writer.name(key);
        gson.toJson(bookOranizeList, ArrayList.class, writer);
    }
    writer.endObject();
    writer.close();
    fosOutput.flush();
    fosOutput.close();
    reader.endObject();
    reader.close();
    fisInput.close();
} catch (Exception e) {
    log.info(e.getMessage().toString());
}
Wednesday, May 20, 2015
[gradle] install a Nexus server on Ubuntu
Install the Nexus server on Ubuntu.

Download it:
sudo wget http://www.sonatype.org/downloads/nexus-2.1.1.war

Rename it:
mv nexus-2.1.1.war nexus.war

Copy it under Tomcat:
cp nexus.war /opt/apache-tomcat-7.0.61/webapps/

Open the following URL in a browser:
http://192.168.22.148:8080/nexus
Default credentials: admin/admin123

Lesson learned: after installing I did not set any config, so sonatype-work and the other config files were generated under root/.

Set the relevant config:
vim /opt/apache-tomcat-7.0.61/webapps/nexus/WEB-INF/plexus.properties
and change the following entry (you can set whatever path you like):
nexus-work=/opt/nexus-repo

Test whether an uploaded lib ends up in the path configured above (using LeoLib as the example):
log in at http://192.168.22.148:8080/nexus, choose Repositories -> Releases -> Artifact Upload (manual) -> GAV Definition, select GAV Parameters and set Group: LeoLib, Artifact: LeoLib, Version: 1.0.
Check whether the upload succeeded under:
/opt/nexus-repo/storage/releases/

How the settings in build.gradle relate to Nexus and Artifact Upload:
for example, with GAV Parameters set to Group: LeoLib, Artifact: LeoLib, Version: 1.0, the dependencies section of your build.gradle would be:
compile group:'LeoLib', name:'LeoLib', version:'1.0'

To look up a lib's group, name, and version on Maven, see http://mvnrepository.com/

Note: security.xml is only generated after you log in.
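To actually resolve LeoLib from this Nexus instance, build.gradle also needs a repository entry pointing at the releases repo. A minimal sketch, assuming Nexus 2's default releases repository path (the URL path below is an assumption, not something configured in this post):

repositories {
    // the hosted "Releases" repository on the Nexus instance above (URL path assumed)
    maven { url 'http://192.168.22.148:8080/nexus/content/repositories/releases' }
}

dependencies {
    compile group: 'LeoLib', name: 'LeoLib', version: '1.0'
}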
Sunday, May 10, 2015
[searchEngine] [java] Build a Solr index & search it
The complete project can be downloaded from my GitHub:
https://github.com/cihm/GradleAndSolr
SolrSearch.java
package com.job;

import java.net.MalformedURLException;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocumentList;

import com.vo.SolrArgVo;

public class SolrSearch {

    public static Logger log = Logger.getLogger(SolrSearch.class.getName());

    public boolean solrSearch(SolrArgVo solrArgVo) throws MalformedURLException, SolrServerException {
        //HttpSolrServer solr = new HttpSolrServer("http://10.24.100.237:8080/solr/collection1");
        HttpSolrServer solr = new HttpSolrServer("http://192.168.22.148:8080/solr/collection1");

        SolrQuery query = new SolrQuery();
        // English terms are matched exactly, Chinese terms are fuzzy-matched
        // query conditions
        query.setQuery("*:*");
        //query.setFilterQueries("name:" + "登基", "description:" + "登基", "channel:" + "ylib");
        String keyword = solrArgVo.getKetWord();
        String channel = solrArgVo.getChanneCode();
        query.addFilterQuery("channel:" + channel);
        query.setQuery("name:" + keyword + " OR description:" + keyword);
        //query.setQuery("name:"+"王道 AND currency:"+"NTD"); // can be used to get books by category
        query.setRows(100); // number of rows in the query result (default is 10)
        //query.setQuery("title:國王");
        //query.setQuery("title: art");

        QueryResponse response = solr.query(query);
        SolrDocumentList results = response.getResults();
        System.out.println("NumFound=" + results.getNumFound());
        System.out.println("SIZE=" + results.size());
        //System.out.println(results);
        //System.out.println(results.get(0).get("title"));
        for (int i = 0; i < results.size(); ++i) {
            System.out.println("result " + i + "= " + results.get(i).get("id"));
            System.out.println("result " + i + "=" + results.get(i));
        }
        return true;
    }
}
SolrBuildIndex.java
package com.job;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.BinaryRequestWriter;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrInputDocument;

import com.util.DataUtil;
import com.util.SolrConstants;

public class SolrBuildIndex {

    public static Logger log = Logger.getLogger(SolrBuildIndex.class.getName());

    public boolean solrBuildIndex() throws MalformedURLException, SolrServerException, IOException {
        HttpSolrServer server = new HttpSolrServer("http://192.168.22.148:8080/solr/collection1");

        // clear the previously built index data (delete all docs)
        server.deleteByQuery("*:*");

        // use the binary request writer to improve performance
        server.setRequestWriter(new BinaryRequestWriter());

        String datFilePath = "C:/Users/1409035/Desktop/FTP_server_backup/candelete/";
        //String datFilePath = PropertyLoader.getInstance().getValue(DAT_FILE_PATH_KEY);
        Map zinioMap = (HashMap) DataUtil.DeSerialization(datFilePath + "Zinio" + "-" + "getItemListByCategoryCode" + "-using" + ".dat");
        Map ylibMap = (HashMap) DataUtil.DeSerialization(datFilePath + "Ylib" + "-" + "getItemListByCategoryCode" + "-using" + ".dat");
        Map ibobarMap = (HashMap) DataUtil.DeSerialization(datFilePath + "Ibobar" + "-" + "getItemListByCategoryCode" + "-using" + ".dat");
        Map linkingMap = (HashMap) DataUtil.DeSerialization(datFilePath + "Linking" + "-" + "getItemListByCategoryCode" + "-using" + ".dat");

        Collection<SolrInputDocument> docs2 = new ArrayList<SolrInputDocument>();
        int k = 0;

        Iterator linkingIter = linkingMap.entrySet().iterator();
        while (linkingIter.hasNext()) {
            Map.Entry entry = (Map.Entry) linkingIter.next();
            ArrayList<HashMap<String, Object>> bookAL = (ArrayList<HashMap<String, Object>>) entry.getValue();
            for (HashMap<String, Object> hm : bookAL) {
                k++;
                SolrInputDocument doc = new SolrInputDocument();
                doc.addField("id", k);
                doc.addField("channel", "Linking");
                doc.addField("name", hm.get("name"));
                doc.addField("description", hm.get("description"));
                docs2.add(doc);
            }
        }

        Iterator ylibIter = ylibMap.entrySet().iterator();
        while (ylibIter.hasNext()) {
            Map.Entry entry = (Map.Entry) ylibIter.next();
            ArrayList<HashMap<String, Object>> bookAL = (ArrayList<HashMap<String, Object>>) entry.getValue();
            for (HashMap<String, Object> hm : bookAL) {
                k++;
                SolrInputDocument doc = new SolrInputDocument();
                doc.addField("id", k);
                doc.addField("channel", "Ylib");
                doc.addField("name", hm.get("name"));
                doc.addField("description", hm.get("description"));
                System.out.println(doc.toString());
                docs2.add(doc);
            }
        }

        Iterator zinioIter = zinioMap.entrySet().iterator();
        while (zinioIter.hasNext()) {
            Map.Entry entry = (Map.Entry) zinioIter.next();
            // note: don't assign the value below to an ALHM, or an "ArrayList can't be cast to ALHM" error is thrown
            ArrayList<HashMap<String, Object>> bookAL = (ArrayList<HashMap<String, Object>>) entry.getValue();
            for (HashMap<String, Object> hm : bookAL) {
                k++;
                System.out.println(entry.getKey() + " " + hm.get("Title"));
                SolrInputDocument doc = new SolrInputDocument();
                doc.addField("id", k);
                doc.addField("channel", "Zinio");
                doc.addField("name", hm.get("name"));
                doc.addField("description", hm.get("description"));
                docs2.add(doc);
            }
        }

        Iterator ibobarIter = ibobarMap.entrySet().iterator();
        while (ibobarIter.hasNext()) {
            Map.Entry entry = (Map.Entry) ibobarIter.next();
            ArrayList<HashMap<String, Object>> bookAL = (ArrayList<HashMap<String, Object>>) entry.getValue();
            for (HashMap<String, Object> hm : bookAL) {
                k++;
                SolrInputDocument doc = new SolrInputDocument();
                doc.addField("id", k);
                doc.addField("channel", "Ibobar");
                doc.addField("name", hm.get("name"));
                doc.addField("description", hm.get("description"));
                docs2.add(doc);
            }
        }

        // convert the ArrayList to XML format
        //String resultList=GeneralXmlPullParser.reverse(contentAL);
        System.out.println("=======");
        //System.out.println(docs2.toString());
        server.add(docs2);
        server.commit();
        server.optimize(true, true);
        System.out.println("finish");
        return true;
    }
}
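For reference, the Gradle dependencies these two classes rely on would look roughly like the sketch below; the SolrJ version is an assumption (the actual versions are declared in the GitHub project above):

dependencies {
    // SolrJ client providing HttpSolrServer, SolrQuery, SolrInputDocument, etc. (version assumed)
    compile group: 'org.apache.solr', name: 'solr-solrj', version: '4.10.4'
    // log4j for the Logger used in both classes
    compile group: 'log4j', name: 'log4j', version: '1.2.16'
}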
[gradle] install Gradle & use it to build a project
On Windows

Installation:
. Download Gradle (gradle-2.3-all.zip).
. Unzip it to some path (C:\Users\1409035\gradle-2.3).
. Add the bin directory to the PATH environment variable.
. Test it: run gradle -v on the command line.

==========================================

First project
. Create a project with the following directory structure:

├── build.gradle
└── src
    └── main
        ├── java
        │   └── com
        │       └── handler
        │           └── SolrHandler.java
        └── resources
            └── log4j.properties

. SolrHandler.java:

package com.handler;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class SolrHandler {

    static Log logger = LogFactory.getLog(SolrHandler.class);

    public static void main(String[] args) {
        logger.info("Hello World");
    }
}

. log4j.properties:

log4j.rootLogger = INFO, A1, A2

# A1 is set to be a ConsoleAppender
log4j.appender.A1 = org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout = org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern = [solrJava.log.%d{yyyyMMdd-HH:mm}][%p][%C-%L] %m%n

# A2 is set to be a file appender
# produce a log record file every day
#log4j.appender.A2 = org.apache.log4j.FileAppender
log4j.appender.A2 = org.apache.log4j.RollingFileAppender
log4j.appender.A2.layout = org.apache.log4j.PatternLayout
log4j.appender.A2.layout.ConversionPattern = [%d{yyyyMMdd-HH:mm}][%p][%C-%L] %m%n
log4j.appender.A2.File = /opt/app/logs/solrJava.log
# DatePattern is a DailyRollingFileAppender property, which is why the run below prints the log4j:WARN
log4j.appender.A2.DatePattern = '.'yyyyMMdd-HH:mm
log4j.appender.A2.MaxFileSize=10MB

. build.gradle:

/* apply the java plugin to get the tasks for compiling a java project */
apply plugin: 'java'
/* apply the application plugin to get the tasks for running a java project */
apply plugin: 'application'
/* parameter used by the application plugin */
mainClassName = "com.handler.SolrHandler"
/* maven repository server */
repositories {
    mavenCentral()
}
/* declare the project's dependencies */
dependencies {
    compile group: 'commons-logging', name: 'commons-logging', version: '1.1.1'
    compile group: 'log4j', name: 'log4j', version: '1.2.16'
}

. Running the run task with the gradle command gives:

C:\Users\1409035\Documents\GitHub\SolrJava> gradle run
:compileJava UP-TO-DATE
:processResources UP-TO-DATE
:classes UP-TO-DATE
:run
log4j:WARN No such property [datePattern] in org.apache.log4j.RollingFileAppender.
[solrJava.log.20150504-20:25][INFO][com.handler.SolrHandler-11] Hello World

BUILD SUCCESSFUL

. After the build, the folder contains:

C:.
├─.gradle
│  └─2.3
│     └─taskArtifacts
├─build
│  ├─classes
│  │  └─main
│  │     └─com
│  │        └─handler
│  ├─dependency-cache
│  ├─resources
│  │  └─main
│  └─tmp
│     └─compileJava
└─src
   └─main
      ├─java
      │  └─com
      │     └─handler
      └─resources