<< 2008년 3월 23일 (일) | | 2008년 3월 25일 (화) >>

웹 페이지로부터 RDF META-DATA 생성 방법

일반 웹페이지에서 RDF Meta-Data를 추출할 수 있는 방법을 소개해 드립니다.
오픈소스인 WebCat을 사용하였습니다.  사용 방법은 아래와 같습니다.

1. WebCat 소스 다운로드
  - http://webcat.sourceforge.net/

2. languages(언어) 파일을 아래 디렉토리에 저장
  - /k2/src/rdf/language-profiles
  - WebCat 라이브러리의 압축을 풀면 들어 있음

3. 소스
import pt.tumba.parser.HTMLParser;
import pt.tumba.parser.bib.BIB2HTML;
import pt.tumba.parser.doc.DOC2HTML;
import pt.tumba.parser.dvi.DVI2HTML;
import pt.tumba.parser.pdf.PDF2HTML;
import pt.tumba.parser.ppt.PPT2HTML;
import pt.tumba.parser.ps.PS2HTML;
import pt.tumba.parser.swf.SWF2HTML;
import pt.tumba.parser.tex.TEX2HTML;
import pt.tumba.parser.txt.TXT2HTML;
import pt.tumba.parser.unrtf.RTF2HTML;
import pt.tumba.parser.xls.XLS2HTML;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.net.URL;
import java.util.Iterator;

public class RDFExtractor
{

/**
 * Escapes the five characters that are significant in SGML/XML markup so
 * that arbitrary text can be placed in element content or attribute values.
 *
 * <p>The double quote, ampersand, less-than sign, greater-than sign and
 * apostrophe are replaced by their predefined entity references. Every
 * other character, including the backslash, is copied through unchanged.
 *
 * @param input text to escape; {@code null} is treated as an empty string
 * @return the escaped text, never {@code null}
 */
public static String encodeSGML(String input)
{
    if (input == null) {
        return "";
    }
    int len = input.length();
    // A little head-room avoids a reallocation when only a few characters expand.
    StringBuilder out = new StringBuilder(len + 16);
    for (int i = 0; i < len; i++) {
        char c = input.charAt(i);
        switch (c) {
            case '"':
                out.append("&quot;");
                break;
            case '&':
                out.append("&amp;");
                break;
            case '<':
                out.append("&lt;");
                break;
            case '>':
                out.append("&gt;");
                break;
            case '\'':
                // &apos; is the entity for the apostrophe, not the backslash.
                out.append("&apos;");
                break;
            default:
                out.append(c);
        }
    }
    return out.toString();
}

/**
*
* @param Meta-Data parser
* @return Meta-Data의 RDF문자열 리턴
*/
public static String getRDFData(HTMLParser parser)
{
StringBuffer sbuf = new StringBuffer();
int pos[];
int i;
Iterator it1;
Iterator it2;
URL u;
String s;
String s2;
sbuf.append("<?xml version='1.0' ?>\n");
sbuf.append("<r:RDF xmlns:r='http://www.w3.org/
1999/02/22-rdf-syntax-ns#'\n");
sbuf.append("xmlns:d='http://purl.org/dc/elements/1.1/'\n");
sbuf.append("xmlns:s='http://www.w3.org/2000/01/rdf-schema#'\n");
sbuf.append("xmlns:h='http://www.w3.org/1999/xx/http#'\n");
sbuf.append("xmlns:t='http://purl.org/dc/terms/'>\n");

sbuf.append("<r:Description r:about=\"" +
encodeSGML(parser.getName()) + "\">\n");
if (!parser.getMetaData().getTitle().equals("")) {
sbuf.append("<d:Title>" +
encodeSGML(parser.getMetaData().getTitle()) +
"</d:Title>\n");
}
if (!parser.getMetaData().getDescription().equals("") ||
!parser.getMetaData().getKeywords().equals("")) {
sbuf.append("<t:abstract>\n");
sbuf.append("<r:Alt>\n");
sbuf.append("<r:li r:ID='DocumentAbstract'>" +
encodeSGML(parser.getMetaData().getDescription()) +
"</r:li>\n");
sbuf.append("<r:li r:ID='DocumentKeywords'>" +
encodeSGML(parser.getMetaData().getKeywords()) +
"</r:li>\n");
sbuf.append("</r:Alt>\n");
sbuf.append("</t:abstract>\n");
}
if (!parser.getMetaData().getAuthor().equals("") ||
!parser.getMetaData().getCreator().equals("")) {
sbuf.append("<d:Creator>\n");
sbuf.append("<r:Alt>\n");
if (!parser.getMetaData().getAuthor().equals(""))
sbuf.append("<r:li>" +
encodeSGML(parser.getMetaData().getAuthor()) +
"</r:li>\n");
if (!parser.getMetaData().getCreator().equals(""))
sbuf.append("<r:li>" +
encodeSGML(parser.getMetaData().getCreator()) +
"</r:li>\n");
sbuf.append("</r:Alt>\n");
sbuf.append("</d:Creator>\n");
}
if (!parser.getMetaData().getCopyright().equals("")) {
sbuf.append("<d:Rights>" +
encodeSGML(parser.getMetaData().getCopyright()) +
"</d:Rights>\n");
}
if (!parser.getLanguage().equals("")) {
sbuf.append("<d:Language>" +
encodeSGML(parser.getLanguage()) + "</d:Language>\n");
}
if (!parser.getMetaData().getMIMEType().equals("")) {
sbuf.append("<d:Type>" +
encodeSGML(parser.getMetaData().getMIMEType()) +
"</d:Type>\n");
}
if (!parser.getMetaData().getFormat().equals("")) {
sbuf.append("<d:Format>" +
encodeSGML(parser.getMetaData().getFormat()) +
"</d:Format>\n");
}
if (!parser.getMetaData().getPublisher().equals("")) {
sbuf.append("<d:Publisher>" +
encodeSGML(parser.getMetaData().getPublisher()) +
"</d:Publisher>\n");
}
if (!parser.getMetaData().getContributor().equals("")) {
sbuf.append("<d:Contributor>" +
encodeSGML(parser.getMetaData().getContributor()) +
"</d:Contributor>\n");
}
if (!parser.getMetaData().getSource().equals("")) {
sbuf.append("<d:Source>" +
encodeSGML(parser.getMetaData().getSource()) +
"</d:Source>\n");
}
if (!parser.getMetaData().getDate().equals("")) {
sbuf.append("<d:Date>" +
encodeSGML(parser.getMetaData().getDate()) +
"</d:Date>\n");
}

if (!parser.getMetaData().getDateCreated().equals("")) {
sbuf.append("<t:created>" +
encodeSGML(parser.getMetaData().getDateCreated()) +
"</t:created>\n");
}
if (!parser.getMetaData().getDateAvailable().equals("")) {
sbuf.append("<t:available>" +
encodeSGML(parser.getMetaData().getDateAvailable()) +
"</t:available>\n");
}

sbuf.append("<d:Relation>\n");
sbuf.append("<r:Alt>\n");
if (!parser.getMetaData().getRelation().equals(""))
sbuf.append("<r:li>" +
encodeSGML(parser.getMetaData().getRelation()) +
"</r:li>\n");
if (!parser.getMetaData().getLinkRSS().equals(""))
sbuf.append("<r:li r:ID='LinkRSS'>" +
encodeSGML(parser.getMetaData().getLinkRSS()) + "</r:li>\n");
sbuf.append("</r:Alt>\n");
sbuf.append("</d:Relation>\n");
if (!parser.getMetaData().getRelationIsPartOf().equals("")) {
sbuf.append("<t:isPartOf>" +
encodeSGML(parser.getMetaData().getRelationIsPartOf()) +
"</t:isPartOf>\n");
}
if (!parser.getMetaData().getRelationIsFormatOf().equals("")) {
sbuf.append("<t:isFormatOf>" +
encodeSGML(parser.getMetaData().getRelationIsFormatOf()) +
"</t:isFormatOf>\n");
}
if (!parser.getMetaData().getRelationIsVersionOf().equals("")) {
sbuf.append("<t:isVersionOf>" +
encodeSGML(parser.getMetaData().getRelationIsVersionOf()) +
"</t:isVersionOf>\n");
}
if (!parser.getMetaData().getRelationIsBasedOn().equals("")) {
sbuf.append("<t:isBasedOn>" +
encodeSGML(parser.getMetaData().getRelationIsBasedOn()) +
"</t:isBasedOn>\n");
}
if (!parser.getMetaData().getRelationRequires().equals("")) {
sbuf.append("<t:requires>" +
encodeSGML(parser.getMetaData().getRelationRequires()) +
"</t:requires>\n");
}
sbuf.append("</r:Description>\n");
sbuf.append("</r:RDF>");
return sbuf.toString();
}

/**
 * Prepares a parser to read a document supplied as a stream.
 *
 * <p>When the MIME type names one of the supported non-HTML formats, the
 * stream is first converted to HTML by the matching converter and the
 * parser is initialised on the converted text. Otherwise, or when the
 * converter returns {@code null}, the parser is initialised directly on
 * {@code src}.
 *
 * <p>The MIME type is compared case-insensitively and any parameters after
 * a semicolon (for example {@code "; charset=UTF-8"}) are ignored. As in
 * the original code, a value merely has to <em>end with</em> a known type.
 *
 * @param src stream holding the source document
 * @param t   MIME type of the source document; {@code null} or an
 *            unrecognised type means the stream is handed to the parser
 *            without conversion
 * @param aux parser to initialise
 * @return {@code aux}, after its tokenizer has been initialised
 * @throws Exception if a converter or the tokenizer initialisation fails
 */
public static HTMLParser initParser(InputStream src,
        String t, HTMLParser aux)
        throws Exception {
    String mime = normalizeMimeType(t);
    String s = null;

    if (mime.endsWith("application/pdf")) {
        s = new PDF2HTML().convertPDFToHTML(src);
    } else if (mime.endsWith("application/postscript")) {
        s = new PS2HTML().convertPSToHTML(src);
    } else if (mime.endsWith("application/msword")) {
        s = new DOC2HTML().convertDOCToHTML(src);
    } else if (mime.endsWith("application/vnd.ms-powerpoint") ||
            mime.endsWith("application/powerpoint") ||
            mime.endsWith("application/mspowerpoint")) {
        s = new PPT2HTML().convertPPTToHTML(src);
    } else if (mime.endsWith("application/x-latex") ||
            mime.endsWith("application/x-tex")) {
        s = new TEX2HTML().convertTEXToHTML(src);
    } else if (mime.endsWith("application/x-bibtex")) {
        s = new BIB2HTML().convertBIBToHTML(src);
    } else if (mime.endsWith("application/x-dvi")) {
        // Use a real converter instance rather than calling through a null
        // reference, matching how DVI is handled for URL sources in this class.
        s = new DVI2HTML().convertDVIToHTML(src);
    } else if (mime.endsWith("text/rtf") ||
            mime.endsWith("application/rtf") ||
            mime.endsWith("text/richtext")) {
        s = new RTF2HTML().convertRTFToHTML(src);
    } else if (mime.endsWith("text/tab-separated-values") ||
            mime.endsWith("text/plain")) {
        s = new TXT2HTML().convertTXTToHTML(src);
    } else if (mime.endsWith("application/excel") ||
            mime.endsWith("application/vnd.ms-excel") ||
            mime.endsWith("application/x-excel")) {
        s = new XLS2HTML().convertXLSToHTML(src);
    } else if (mime.endsWith("application/x-shockwave-flash")) {
        s = new SWF2HTML().convertSWFToHTML(src);
    }

    if (s == null) {
        aux.initTokenizer(src);
    } else {
        // NOTE(review): getBytes() uses the platform default charset; confirm
        // which encoding initTokenizer expects before pinning one here.
        aux.initTokenizer(
                new ByteArrayInputStream(s.getBytes()));
    }
    return aux;
}

/**
 * Reduces a Content-Type style value to a bare, lower-case media type.
 *
 * @param t raw MIME type, possibly with parameters; may be {@code null}
 * @return the trimmed, lower-cased text before the first semicolon, or an
 *         empty string when {@code t} is {@code null}
 */
private static String normalizeMimeType(String t)
{
    if (t == null) {
        return "";
    }
    int semicolon = t.indexOf(';');
    String bare = (semicolon >= 0) ? t.substring(0, semicolon) : t;
    // A fixed locale keeps the result independent of the default locale
    // (e.g. the Turkish dotless i).
    return bare.trim().toLowerCase(java.util.Locale.ENGLISH);
}


/**
 * Prepares a parser to read a document identified by a location string.
 *
 * <p>A location starting with {@code "file:"} is read from the local file
 * named by the text after that prefix; any other location is treated as a
 * URL. The document format is chosen from the location's extension,
 * compared case-insensitively. Supported non-HTML formats are converted to
 * HTML first; otherwise, or when the converter returns {@code null}, the
 * parser is initialised directly on the file or URL.
 *
 * @param src location of the source document ({@code file:} path or URL);
 *            must not be {@code null}
 * @param aux parser to initialise
 * @return {@code aux}, after its tokenizer has been initialised
 * @throws Exception if the URL is malformed, or a converter or the
 *                   tokenizer initialisation fails
 */
public static HTMLParser initParser(String src,
        HTMLParser aux) throws Exception {
    int format = formatFromExtension(src);
    String html;

    if (src.startsWith("file:")) {
        // Everything after the five-character "file:" prefix is the path.
        File file = new File(src.substring(5));
        html = convertLocalFile(format, file);
        if (html == null) {
            aux.initTokenizer(file);
            return aux;
        }
    } else {
        URL url = new URL(src);
        html = convertRemoteDocument(format, url);
        if (html == null) {
            aux.initTokenizer(url);
            return aux;
        }
    }

    // NOTE(review): getBytes() uses the platform default charset; confirm
    // which encoding initTokenizer expects before pinning one here.
    aux.initTokenizer(
            new ByteArrayInputStream(html.getBytes()));
    return aux;
}

/** Format codes shared by the extension lookup and the converters below. */
private static final int FORMAT_HTML = 0;
private static final int FORMAT_PDF = 1;
private static final int FORMAT_PS = 2;
private static final int FORMAT_DOC = 3;
private static final int FORMAT_PPT = 4;
private static final int FORMAT_TEX = 5;
private static final int FORMAT_BIB = 6;
private static final int FORMAT_DVI = 7;
private static final int FORMAT_RTF = 8;
private static final int FORMAT_TXT = 9;
private static final int FORMAT_XLS = 10;
private static final int FORMAT_SWF = 11;

/**
 * Maps the extension at the end of a location to a format code.
 *
 * @param src location string
 * @return one of the {@code FORMAT_*} codes; {@code FORMAT_HTML} when the
 *         extension is not recognised
 */
private static int formatFromExtension(String src)
{
    // A fixed locale keeps the comparison independent of the default locale.
    String lower = src.toLowerCase(java.util.Locale.ENGLISH);
    if (lower.endsWith(".pdf")) {
        return FORMAT_PDF;
    } else if (lower.endsWith(".ps")) {
        return FORMAT_PS;
    } else if (lower.endsWith(".doc")) {
        return FORMAT_DOC;
    } else if (lower.endsWith(".ppt") || lower.endsWith(".pps")) {
        return FORMAT_PPT;
    } else if (lower.endsWith(".tex")) {
        return FORMAT_TEX;
    } else if (lower.endsWith(".bib")) {
        return FORMAT_BIB;
    } else if (lower.endsWith(".dvi")) {
        return FORMAT_DVI;
    } else if (lower.endsWith(".rtf")) {
        return FORMAT_RTF;
    } else if (lower.endsWith(".txt")) {
        return FORMAT_TXT;
    } else if (lower.endsWith(".xls")) {
        return FORMAT_XLS;
    } else if (lower.endsWith(".swf")) {
        return FORMAT_SWF;
    }
    return FORMAT_HTML;
}

/**
 * Converts a local file of the given format to HTML.
 *
 * @return the converter's result, or {@code null} when the format needs no
 *         conversion
 */
private static String convertLocalFile(int format, File file) throws Exception
{
    switch (format) {
        case FORMAT_PDF:
            return new PDF2HTML().convertPDFToHTML(file);
        case FORMAT_PS:
            return new PS2HTML().convertPSToHTML(file);
        case FORMAT_DOC:
            return new DOC2HTML().convertDOCToHTML(file);
        case FORMAT_PPT:
            return new PPT2HTML().convertPPTToHTML(file);
        case FORMAT_TEX:
            return new TEX2HTML().convertTEXToHTML(file);
        case FORMAT_BIB:
            return new BIB2HTML().convertBIBToHTML(file);
        case FORMAT_DVI:
            // Use a real converter instance rather than calling through a
            // null reference, the same way the URL path handles DVI.
            return new DVI2HTML().convertDVIToHTML(file);
        case FORMAT_RTF:
            return new RTF2HTML().convertRTFToHTML(file);
        case FORMAT_TXT:
            return new TXT2HTML().convertTXTToHTML(file);
        case FORMAT_XLS:
            return new XLS2HTML().convertXLSToHTML(file);
        case FORMAT_SWF:
            return new SWF2HTML().convertSWFToHTML(file);
        default:
            return null;
    }
}

/**
 * Converts a document of the given format, fetched from a URL, to HTML.
 *
 * @return the converter's result, or {@code null} when the format needs no
 *         conversion
 */
private static String convertRemoteDocument(int format, URL url) throws Exception
{
    switch (format) {
        case FORMAT_PDF:
            return new PDF2HTML().convertPDFToHTML(url);
        case FORMAT_PS:
            return new PS2HTML().convertPSToHTML(url);
        case FORMAT_DOC:
            return new DOC2HTML().convertDOCToHTML(url);
        case FORMAT_PPT:
            return new PPT2HTML().convertPPTToHTML(url);
        case FORMAT_TEX:
            return new TEX2HTML().convertTEXToHTML(url);
        case FORMAT_BIB:
            return new BIB2HTML().convertBIBToHTML(url);
        case FORMAT_DVI:
            return new DVI2HTML().convertDVIToHTML(url);
        case FORMAT_RTF:
            return new RTF2HTML().convertRTFToHTML(url);
        case FORMAT_TXT:
            return new TXT2HTML().convertTXTToHTML(url);
        case FORMAT_XLS:
            return new XLS2HTML().convertXLSToHTML(url);
        case FORMAT_SWF:
            return new SWF2HTML().convertSWFToHTML(url);
        default:
            return null;
    }
}

/**
 * Command-line demo: parses one document and prints its metadata as
 * RDF/XML on standard output.
 *
 * @param args optional; {@code args[0]} is the document location
 *             ({@code file:} path or URL) and {@code args[1]} is the
 *             directory passed to the {@code HTMLParser} constructor
 *             (the WebCat language-profiles directory). Missing
 *             arguments fall back to the built-in sample values.
 * @throws Exception if the document cannot be fetched, converted or parsed
 */
public static void main(String[] args) throws Exception
{
    String src = (args.length > 0)
            ? args[0]
            : "http://mimul.com/pebble/default/";
    String profileDir = (args.length > 1)
            ? args[1]
            : "/k2/src/rdf/language-profiles";

    HTMLParser hp = new HTMLParser(profileDir);
    initParser(src, hp);
    hp.processData();
    System.out.println(getRDFData(hp));
}
}
4. 실행결과
<?xml version='1.0' ?>
<r:RDF xmlns:r='http://www.w3.org/1999/02/22-rdf-syntax-ns#'
xmlns:d='http://purl.org/dc/elements/1.1/'
xmlns:s='http://www.w3.org/2000/01/rdf-schema#'
xmlns:h='http://www.w3.org/1999/xx/http#'
xmlns:t='http://purl.org/dc/terms/'>
<r:Description r:about="Mimul's Developer World">
<d:Title>Mimul's Developer World</d:Title>
<t:abstract>
<r:Alt>
<r:li r:ID='DocumentAbstract'></r:li>
<r:li r:ID='DocumentKeywords'>
ajax,apple,attention,blog,business,eclipse,enterprise2.0
,feedburner,google,iphone,javascript,kms,lean
,marketing,mindset,openid,oracle,pebble,pojo
,semantic,sns,spring,subversion,tomcat,wiki,youtube
</r:li>
</r:Alt>
</t:abstract>
<d:Language>korean</d:Language>
<d:Type>text/html</d:Type>
<d:Relation>
<r:Alt>
</r:Alt>
</d:Relation>
</r:Description>
</r:RDF>