﻿{"id":166,"date":"2017-11-03T00:00:00","date_gmt":"2017-11-02T16:00:00","guid":{"rendered":""},"modified":"2018-12-14T10:45:50","modified_gmt":"2018-12-14T02:45:50","slug":"the-web-as-a-parallel-corpus","status":"publish","type":"post","link":"http:\/\/www.nlpir.org\/wordpress\/2017\/11\/03\/the-web-as-a-parallel-corpus\/","title":{"rendered":"The Web as a Parallel Corpus"},"content":{"rendered":"<p><P>The Web as a Parallel Corpus<BR>Philip Resnik&#8727; Noah A. Smith&#8224;<BR>University of Maryland Johns Hopkins University<BR>Parallel corpora have become an essential resource for work in multilingual natural language<BR>processing. In this article, we report on our work using the STRAND system for mining parallel<BR>text on the World Wide Web, first reviewing the original algorithm and results and then presenting<BR>a set of significant enhancements. These enhancements include the use of supervised learning<BR>based on structural features of documents to improve classification performance, a new content-based<BR>measure of translational equivalence, and adaptation of the system to take advantage of the<BR>Internet Archive for mining parallel text from the Web on a large scale. Finally, the value of these<BR>techniques is demonstrated in the construction of a significant parallel corpus for a low-density<BR>language pair.<\/P><br \/>\n<P><U><FONT color=#810081>Philip Resnik\uff0cNoah A. 
Smith2007 Computational Linguistics<\/FONT><\/U><\/P><br \/>\n<P><U><FONT color=#810081>&nbsp;<A href=\"http:\/\/www.nlpir.org\/wordpress\/attachments\/2011\/05\/The Web as a Parallel Corpus.pdf\" target=_blank><IMG border=0 src=\"http:\/\/www.nlpir.org\/images\/base\/attachment.gif\"> The Web as a Parallel Corpus.pdf(430 KB)<\/A><\/FONT><\/U><\/P><\/p>\n","protected":false},"excerpt":{"rendered":"<p>The Web as a Parallel CorpusPhilip Resni &hellip; <a href=\"http:\/\/www.nlpir.org\/wordpress\/2017\/11\/03\/the-web-as-a-parallel-corpus\/\">\u7ee7\u7eed\u9605\u8bfb <span class=\"meta-nav\">&rarr;<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[31],"tags":[],"_links":{"self":[{"href":"http:\/\/www.nlpir.org\/wordpress\/wp-json\/wp\/v2\/posts\/166"}],"collection":[{"href":"http:\/\/www.nlpir.org\/wordpress\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.nlpir.org\/wordpress\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.nlpir.org\/wordpress\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.nlpir.org\/wordpress\/wp-json\/wp\/v2\/comments?post=166"}],"version-history":[{"count":1,"href":"http:\/\/www.nlpir.org\/wordpress\/wp-json\/wp\/v2\/posts\/166\/revisions"}],"predecessor-version":[{"id":1441,"href":"http:\/\/www.nlpir.org\/wordpress\/wp-json\/wp\/v2\/posts\/166\/revisions\/1441"}],"wp:attachment":[{"href":"http:\/\/www.nlpir.org\/wordpress\/wp-json\/wp\/v2\/media?parent=166"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.nlpir.org\/wordpress\/wp-json\/wp\/v2\/categories?post=166"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.nlpir.org\/wordpress\/wp-json\/wp\/v2\/tags?post=166"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}