Warning, /office/kbibtex-testset/bib/in-the-wild/ir.bib is written in an unsupported language. File is not indexed.
@inproceedings{domingos00unified,
  author      = {Pedro Domingos},
  title       = {A Unified Bias-Variance Decomposition for Zero-One and Squared Loss},
  booktitle   = {Proc. National Conference on Artificial Intelligence and Proc. Conference Innovative Applications of Artificial Intelligence},
  year        = {2000},
  isbn        = {0-262-51112-6},
  pages       = {564--569},
  publisher   = {AAAI Press / The MIT Press},
}

% NOTE(review): the journal name below is stored abbreviated (IP\&M); a string
% macro would allow switching to the full name globally.
@article{turtle95query,
  author      = {Howard Turtle and James Flood},
  title       = {Query evaluation: strategies and optimizations},
  journal     = {IP\&M},
  volume      = {31},
  number      = {6},
  year        = {1995},
  issn        = {0306-4573},
  pages       = {831--850},
  doi         = {10.1016/0306-4573(95)00020-H},
  publisher   = {Pergamon Press},
  address     = {Tarrytown, NY, USA},
}

@inproceedings{aberer01pgrid,
  author      = {Karl Aberer},
  title       = {{P-Grid}: A Self-Organizing Access Structure for {P2P} Information Systems},
  booktitle   = {Proc. International Conference on Cooperative Information Systems},
  year        = {2001},
  isbn        = {3-540-42524-1},
  pages       = {179--194},
  publisher   = {Springer},
  address     = {London, UK},
}

@incollection{callan00distributed,
  author      = {Jamie Callan},
  title       = {Distributed information retrieval},
  editor      = {W. Bruce Croft},
  booktitle   = {Advances in information retrieval},
  pages       = {127--150},
  publisher   = {Kluwer},
  year        = {2000},
}

@inproceedings{robertson04simple,
  author      = {Stephen Robertson and Hugo Zaragoza and Michael Taylor},
  title       = {Simple {BM25} extension to multiple weighted fields},
  booktitle   = {Proc.
CIKM},
  year        = {2004},
  isbn        = {1-58113-874-1},
  pages       = {42--49},
  doi         = {10.1145/1031171.1031181},
}
% location = {Washington, D.C., USA},
% publisher = {ACM Press},
% address = {New York, NY, USA},


@article{smeulders00contentbased,
  author      = {Arnold W. M. Smeulders and Marcel Worring and Simone Santini and Amarnath Gupta and Ramesh Jain},
  title       = {Content-Based Image Retrieval at the End of the Early Years},
  journal     = {IEEE Trans. Pattern Anal. Mach. Intell.},
  volume      = {22},
  number      = {12},
  year        = {2000},
  issn        = {0162-8828},
  pages       = {1349--1380},
  doi         = {10.1109/34.895972},
  publisher   = {IEEE Computer Society},
  address     = {Washington, DC, USA},
}

@inproceedings{tombros98advantages,
  author      = {Anastasios Tombros and Mark Sanderson},
  title       = {Advantages of query biased summaries in information retrieval},
  booktitle   = {Proc. SIGIR},
  year        = {1998},
  isbn        = {1-58113-015-5},
  pages       = {2--10},
  location    = {Melbourne, Australia},
  doi         = {10.1145/290941.290947},
  publisher   = {ACM Press},
  address     = {New York, NY, USA},
}

@techreport{oard96survey,
  author      = {Douglas W. Oard and Bonnie J. Dorr},
  title       = {A survey of multilingual text retrieval},
  year        = {1996},
  institution = {Institute for Advanced Computer Studies, University of Maryland},
  number      = {UMIACS-TR-96-19},
  address     = {College Park, MD, USA},
}

% The accent is written as a brace group ({\"a}) so that BibTeX treats it as a
% single letter when sorting and building labels.
@book{ingwersen05turn,
  author      = {Peter Ingwersen and Kalervo J{\"a}rvelin},
  title       = {The Turn: Integration of Information Seeking and Retrieval in Context},
  year        = {2005},
  isbn        = {140203850X},
  publisher   = {Springer},
  address     = {Secaucus, NJ, USA},
}

@article{lewis96natural,
  author      = {David D.
Lewis and Karen Sp{\"a}rck Jones},
  title       = {Natural language processing for information retrieval},
  journal     = {CACM},
  volume      = {39},
  number      = {1},
  year        = {1996},
  issn        = {0001-0782},
  pages       = {92--101},
  doi         = {10.1145/234173.234210},
  publisher   = {ACM Press},
  address     = {New York, NY, USA},
}

@book{meadow99text,
  author      = {Charles T. Meadow and Donald H. Kraft and Bert R. Boyce},
  title       = {Text Information Retrieval Systems},
  year        = {1999},
  isbn        = {0124874053},
  publisher   = {Academic Press},
  address     = {Orlando, FL, USA},
}



@article{creecy92trading,
  author      = {Robert H. Creecy and Brij M. Masand and Stephen J. Smith and David L. Waltz},
  title       = {Trading {MIPS} and memory for knowledge engineering},
  publisher   = {ACM Press},
  journal     = {CACM},
  volume      = {35},
  number      = {8},
  year        = {1992},
  issn        = {0001-0782},
  pages       = {48--64},
  doi         = {10.1145/135226.135228},
  address     = {New York, NY},
}

% NOTE(review): the url below has no scheme (http/https); kept as found.
@phdthesis{sornil01parallel,
  author      = {Ohm Sornil},
  title       = {Parallel Inverted Index for Large-Scale, Dynamic Digital Libraries},
  year        = {2001},
  school      = {Virginia Tech},
  url         = {scholar.lib.vt.edu/theses/available/etd-02062001-114915/},
}

% This entry has no booktitle or publisher of its own; it points at the parent
% entry frakes92information through crossref. The parent is not visible in this
% part of the file -- classic BibTeX needs it to appear after this entry.
@incollection{harman92inverted,
  author      = {Donna Harman and Ricardo Baeza-Yates and Edward Fox and W. Lee},
  title       = {Inverted files},
  year        = {1992},
  pages       = {28--43},
  crossref    = {frakes92information},
}

@book{garciamolina99database,
  author      = {Hector Garcia-Molina and Jennifer Widom and Jeffrey D.
Ullman},
  title       = {Database System Implementation},
  year        = {1999},
  isbn        = {0130402648},
  publisher   = {Prentice Hall},
  address     = {Upper Saddle River, NJ, USA},
}

% The ee and bibsource fields come from DBLP exports; standard styles ignore
% them, so their values are kept exactly as found.
@inproceedings{altingovde07largescale,
  author      = {Ismail Seng{\"o}r Alting{\"o}vde and Rifat Ozcan and Huseyin Cagdas Ocalan and Fazli Can and {\"O}zg{\"u}r Ulusoy},
  title       = {Large-scale cluster-based retrieval experiments on {T}urkish texts},
  booktitle   = {Proc. SIGIR},
  publisher   = {ACM Press},
  year        = {2007},
  pages       = {891--892},
  ee          = {doi.acm.org/10.1145/1277741.1277961},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{eyheramendy03naive,
  author      = {Susana Eyheramendy and David Lewis and David Madigan},
  title       = {On the {Naive Bayes} Model for Text Categorization},
  booktitle   = {International Workshop on Artificial Intelligence and Statistics},
  year        = {2003},
  publisher   = {Society for Artificial Intelligence and Statistics},
}

@inproceedings{cacheda03optimization,
  author      = {Fidel Cacheda and Victor Carneiro and Carmen Guerrero and {\'A}ngel Vi{\~n}a},
  title       = {Optimization of Restricted Searches in Web Directories Using Hybrid Data Structures},
  booktitle   = {Proc. ECIR},
  year        = {2003},
  pages       = {436--451},
  ee          = {link.springer.de/link/service/series/0558/bibs/2633/26330436.htm},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{pelleg99accelerating,
  author      = {Dan Pelleg and Andrew Moore},
  title       = {Accelerating exact k-means algorithms with geometric reasoning},
  booktitle   = {Proc.
KDD},
  publisher   = {ACM Press},
  year        = {1999},
  isbn        = {1-58113-143-7},
  pages       = {277--281},
  location    = {San Diego, CA},
  doi         = {10.1145/312129.312248},
  address     = {New York, NY},
}

@inproceedings{bradley98refining,
  author      = {Paul S. Bradley and Usama M. Fayyad},
  title       = {Refining Initial Points for {K}-Means Clustering},
  booktitle   = {Proc. ICML},
  year        = {1998},
  pages       = {91--99},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

% The address previously read "Oxford, UK, UK"; the duplicated country was dropped.
@article{can04efficiency,
  author      = {Fazli Can and Ismail Seng{\"o}r Alting{\"o}vde and Engin Demir},
  title       = {Efficiency and effectiveness of query processing in cluster-based retrieval},
  journal     = {Information Systems},
  volume      = {29},
  number      = {8},
  year        = {2004},
  issn        = {0306-4379},
  pages       = {697--717},
  doi         = {10.1016/S0306-4379(03)00062-0},
  publisher   = {Elsevier Science},
  address     = {Oxford, UK},
}

@inproceedings{davidson03speeding,
  author      = {Ian Davidson and Ashwin Satyanarayana},
  title       = {Speeding up k-means Clustering by Bootstrap Averaging},
  booktitle   = {ICDM 2003 Workshop on Clustering Large Data Sets},
  year        = {2003},
}

@inproceedings{rosenzvi04authortopic,
  author      = {Michal Rosen-Zvi and Thomas Griffiths and Mark Steyvers and Padhraic Smyth},
  title       = {The author-topic model for authors and documents},
  booktitle   = {Proc. UAI},
  year        = {2004},
  isbn        = {0-9749039-0-6},
  pages       = {487--494},
}
% publisher = {AUAI Press},
% address = {Arlington, Virginia, United States},
% location = {Banff},

@inproceedings{trotman06xmlir,
  author      = {Andrew Trotman and Nils Pharo and Miro Lehtonen},
  title       = {{XML}-{IR} Users and Use Cases},
  booktitle   = {Proc.
INEX},
  year        = {2006},
  pages       = {400--412},
  ee          = {dx.doi.org/10.1007/978-3-540-73888-6_38},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{betsi06user,
  author      = {Stamatina Betsi and Mounia Lalmas and Anastasios Tombros and Theodora Tsikrika},
  title       = {User expectations from {XML} element retrieval},
  booktitle   = {Proc. SIGIR},
  publisher   = {ACM Press},
  year        = {2006},
  pages       = {611--612},
  ee          = {doi.acm.org/10.1145/1148170.1148280},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{woodley06nlpx,
  author      = {Alan Woodley and Shlomo Geva},
  title       = {{NLPX} at {INEX} 2006},
  booktitle   = {Proc. INEX},
  year        = {2006},
  pages       = {302--311},
  ee          = {dx.doi.org/10.1007/978-3-540-73888-6_30},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

% The address previously read "New York, N" (truncated); restored to the form
% used by the other ACM Press entries in this file.
@inproceedings{yang01thresholding,
  author      = {Yiming Yang},
  title       = {A study of thresholding strategies for text categorization},
  booktitle   = {Proc. SIGIR},
  publisher   = {ACM Press},
  year        = {2001},
  isbn        = {1-58113-331-6},
  pages       = {137--145},
  location    = {New Orleans, LA},
  doi         = {10.1145/383952.383975},
  address     = {New York, NY},
}

@inproceedings{buckley95optimization,
  author      = {Chris Buckley and Gerard Salton},
  title       = {Optimization of relevance feedback weights},
  booktitle   = {Proc.
SIGIR},
  publisher   = {ACM Press},
  year        = {1995},
  isbn        = {0-89791-714-6},
  pages       = {351--357},
  location    = {Seattle, Washington, United States},
  doi         = {10.1145/215206.215383},
  address     = {New York, NY},
}

% number holds a combined issue (2-3) as found; it is not a page range.
@article{ault02information,
  author      = {Thomas Galen Ault and Yiming Yang},
  title       = {Information Filtering in {TREC-9} and {TDT-3}: {A} Comparative Analysis},
  journal     = {IR},
  volume      = {5},
  number      = {2-3},
  year        = {2002},
  issn        = {1386-4564},
  pages       = {159--187},
  publisher   = {Kluwer},
  address     = {Hingham, MA, USA},
}

@inproceedings{yang03marginbased,
  author      = {Yiming Yang and Bryan Kisiel},
  title       = {Margin-based local regression for adaptive filtering},
  booktitle   = {Proc. CIKM},
  year        = {2003},
  isbn        = {1-58113-723-0},
  pages       = {191--198},
  doi         = {10.1145/956863.956902},
}
% location = {New Orleans, LA, USA},
% publisher = {ACM Press},
% address = {New York, NY},


@inproceedings{moschitti03optimal,
  author      = {Alessandro Moschitti},
  title       = {A Study on Optimal Parameter Tuning for {R}occhio Text Classifier},
  booktitle   = {Proc. ECIR},
  year        = {2003},
  pages       = {420--435},
  ee          = {link.springer.de/link/service/series/0558/bibs/2633/26330420.htm},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@techreport{bennett00assessing,
  author      = {Paul N. Bennett},
  title       = {Assessing the calibration of naive {Bayes}' posterior estimates},
  number      = {CMU-CS-00-155},
  institution = {School of Computer Science, Carnegie Mellon University},
  year        = {2000},
}

@inproceedings{turpin07fast,
  author      = {Andrew Turpin and Yohannes Tsegay and David Hawking and Hugh E. Williams},
  title       = {Fast generation of result snippets in web search},
  booktitle   = {Proc.
SIGIR},
  publisher   = {ACM Press},
  year        = {2007},
  pages       = {127--134},
  ee          = {doi.acm.org/10.1145/1277741.1277766},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{ntoulas07pruning,
  author      = {Alexandros Ntoulas and Junghoo Cho},
  title       = {Pruning policies for two-tiered inverted index with correctness guarantee},
  booktitle   = {Proc. SIGIR},
  publisher   = {ACM Press},
  year        = {2007},
  pages       = {191--198},
  ee          = {doi.acm.org/10.1145/1277741.1277776},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

% NOTE(review): no page numbers are recorded for the next entry.
@inproceedings{zhang07performance,
  author      = {Jiangong Zhang and Xiaohui Long and Torsten Suel},
  title       = {Performance of Compressed Inverted List Caching in Search Engines},
  booktitle   = {Proc. CIKM},
  year        = {2007},
}
% publisher = {ACM Press},


@inproceedings{silvestri04assigning,
  author      = {Fabrizio Silvestri and Raffaele Perego and Salvatore Orlando},
  title       = {Assigning document identifiers to enhance compressibility of Web Search Engines indexes},
  booktitle   = {Proc. ACM Symposium on Applied Computing},
  year        = {2004},
  pages       = {600--605},
  ee          = {doi.acm.org/10.1145/967900.968024},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{blandford02index,
  author      = {Dan Blandford and Guy Blelloch},
  title       = {Index Compression through Document Reordering},
  booktitle   = {Proc.
Data Compression Conference},
  year        = {2002},
  pages       = {342},
  publisher   = {IEEE Computer Society},
  address     = {Washington, DC, USA},
}
% NOTE(review): the entry above records a single page (342) rather than a
% range; kept as found.

@article{blanco06tsp,
  author      = {Roi Blanco and Alvaro Barreiro},
  title       = {{TSP} and cluster-based solutions to the reassignment of document identifiers},
  journal     = {IR},
  volume      = {9},
  number      = {4},
  year        = {2006},
  pages       = {499--517},
  ee          = {dx.doi.org/10.1007/s10791-006-6614-y},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{silvestri07sorting,
  author      = {Fabrizio Silvestri},
  title       = {Sorting Out the Document Identifier Assignment Problem},
  booktitle   = {Proc. ECIR},
  year        = {2007},
  pages       = {101--112},
  ee          = {dx.doi.org/10.1007/978-3-540-71496-5_12},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{moffat96exploiting,
  author      = {Alistair Moffat and Lang Stuiver},
  title       = {Exploiting clustering in inverted file compression},
  booktitle   = {Proc. Conference on Data Compression},
  year        = {1996},
  isbn        = {0-8186-7358-3},
  pages       = {82--91},
  publisher   = {IEEE Computer Society},
  address     = {Washington, DC, USA},
}

@inproceedings{moschitti04complex,
  author      = {Alessandro Moschitti and Roberto Basili},
  title       = {Complex Linguistic Features for Text Classification: {A} Comprehensive Study},
  booktitle   = {Proc. ECIR},
  year        = {2004},
  pages       = {181--196},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{rennie03tackling,
  author      = {Jason D. Rennie and Lawrence Shih and Jaime Teevan and David R. Karger},
  title       = {Tackling the Poor Assumptions of Naive {Bayes} Text Classifiers},
  booktitle   = {Proc.
ICML},
  year        = {2003},
  pages       = {616--623},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@article{can90concepts,
  author      = {Fazli Can and Esen A. Ozkarahan},
  title       = {Concepts and Effectiveness of the Cover-Coefficient-Based Clustering Methodology for Text Databases},
  journal     = {ACM Trans. Database Syst.},
  volume      = {15},
  number      = {4},
  year        = {1990},
  pages       = {483--517},
  ee          = {doi.acm.org/10.1145/99935.99938, db/journals/tods/CanO90.html},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@book{anderberg73cluster,
  author      = {Michael R. Anderberg},
  title       = {Cluster analysis for applications},
  address     = {New York},
  publisher   = {Academic Press},
  year        = {1973},
}
% used to have "Probability and Mathematical Statistics, " in the publisher???

% NOTE(review): no report number is given below; the source value ends in
% TR-91-10, which may be the number -- confirm before adding it.
@techreport{fox91fastinv,
  author      = {Edward A. Fox and Whay C. Lee},
  title       = {{FAST-INV}: {A} Fast Algorithm for building large inverted files},
  year        = {1991},
  source      = {www.ncstrl.org:8900/ncstrl/servlet/search?formname=detail\&id=oai%3Ancstrlh%3Avatech_cs%3Ancstrl.vatech_cs%2F%2FTR-91-10},
  institution = {Virginia Polytechnic Institute \& State University},
  address     = {Blacksburg, VA, USA},
}

@inproceedings{ng01spectral,
  author      = {Andrew Y. Ng and Michael I. Jordan and Yair Weiss},
  title       = {On Spectral Clustering: {A}nalysis and an algorithm},
  booktitle   = {Proc. NIPS},
  year        = {2001},
  pages       = {849--856},
  ee          = {www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/AA35.ps.gz},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{kannan00clusterings,
  author      = {Ravi Kannan and Santosh Vempala and Adrian Vetta},
  title       = {On clusterings -- {G}ood, bad and spectral},
  booktitle   = {Proc.
Symposium on Foundations of Computer Science},
  year        = {2000},
  isbn        = {0-7695-0850-2},
  pages       = {367--377},
  publisher   = {IEEE Computer Society},
  address     = {Washington, DC, USA},
}

% The doi below is stored bare (resolver host removed).
@article{boley98principal,
  author      = {Daniel Boley},
  title       = {Principal Direction Divisive Partitioning},
  journal     = {Data Mining and Knowledge Discovery},
  volume      = {2},
  number      = {4},
  year        = {1998},
  issn        = {1384-5810},
  pages       = {325--344},
  doi         = {10.1023/A:1009740529316},
  publisher   = {Kluwer},
  address     = {Hingham, MA, USA},
}

@inproceedings{tishby00data,
  author      = {Naftali Tishby and Noam Slonim},
  title       = {Data Clustering by {M}arkovian Relaxation and the Information Bottleneck Method},
  booktitle   = {Proc. NIPS},
  year        = {2000},
  pages       = {640--646},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{zha01bipartite,
  author      = {Hongyuan Zha and Xiaofeng He and Chris H. Q. Ding and Ming Gu and Horst D. Simon},
  title       = {Bipartite Graph Partitioning and Data Clustering},
  booktitle   = {Proc. CIKM},
  year        = {2001},
  pages       = {25--32},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}
% publisher = {ACM Press},

@inproceedings{dhillon01coclustering,
  author      = {Inderjit S. Dhillon},
  title       = {Co-clustering documents and words using bipartite spectral graph partitioning},
  booktitle   = {Proc. KDD},
  year        = {2001},
  pages       = {269--274},
  ee          = {portal.acm.org/citation.cfm?id=502512.502550},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{hearst93subtopic,
  author      = {Marti A. Hearst and Christian Plaunt},
  title       = {Subtopic structuring for full-length document access},
  booktitle   = {Proc.
SIGIR},
  publisher   = {ACM Press},
  year        = {1993},
  isbn        = {0-89791-605-0},
  pages       = {59--68},
  location    = {Pittsburgh, Pennsylvania, United States},
  doi         = {10.1145/160688.160695},
  address     = {New York, NY},
}

@incollection{berkhin06survey,
  author      = {Pavel Berkhin},
  title       = {A survey of clustering data mining techniques},
  booktitle   = {Grouping Multidimensional Data: {R}ecent Advances in Clustering},
  editor      = {Jacob Kogan and Charles Nicholas and Marc Teboulle},
  year        = {2006},
  pages       = {25--71},
  publisher   = {Springer},
}

% The accent is written as a brace group ({\'c}) so that BibTeX treats it as a
% single letter when sorting and building labels.
@inproceedings{mihajlovic05score,
  author      = {Vojkan Mihajlovi{\'c} and Henk Ernst Blok and Djoerd Hiemstra and Peter M. G. Apers},
  title       = {Score region algebra: {B}uilding a transparent {XML-R} database},
  booktitle   = {Proc. CIKM},
  year        = {2005},
  isbn        = {1-59593-140-6},
  pages       = {12--19},
  doi         = {10.1145/1099554.1099560},
}
% publisher = {ACM Press},
% location = {Bremen},
% address = {New York, NY},

@techreport{chiaramella96model,
  author      = {Yves Chiaramella and Philippe Mulhem and Franck Fourel},
  title       = {A Model for Multimedia Information Retrieval},
  year        = {1996},
  institution = {University of Glasgow},
  number      = {4-96},
}

@inproceedings{forman04pitfall,
  author      = {George Forman},
  title       = {A pitfall and solution in multi-class feature selection for text classification},
  booktitle   = {Proc.
ICML},
  year        = {2004},
  ee          = {doi.acm.org/10.1145/1015330.1015356},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@article{tsochantaridis05large,
  author      = {Ioannis Tsochantaridis and Thorsten Joachims and Thomas Hofmann and Yasemin Altun},
  title       = {Large Margin Methods for Structured and Interdependent Output Variables},
  journal     = {JMLR},
  volume      = {6},
  year        = {2005},
  pages       = {1453--1484},
  ee          = {www.jmlr.org/papers/v6/tsochantaridis05a.html},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

% month uses the predefined three-letter macro (unquoted) so that the style
% decides how the month name is printed.
@inproceedings{riezler07statistical,
  author      = {Riezler, Stefan and Vasserman, Alexander and Tsochantaridis, Ioannis and Mittal, Vibhu and Liu, Yi},
  title       = {Statistical Machine Translation for Query Expansion in Answer Retrieval},
  booktitle   = {Proc. ACL},
  month       = jun,
  year        = {2007},
  address     = {Prague, Czech Republic},
  publisher   = {Association for Computational Linguistics},
  pages       = {464--471},
  url         = {www.aclweb.org/anthology/P/P07/P07-1059},
}

@book{cohen95empirical,
  title       = {Empirical methods for artificial intelligence},
  address     = {Cambridge, MA, USA},
  author      = {Paul R. Cohen},
  publisher   = {MIT Press},
  year        = {1995},
}

@inproceedings{chucarroll06semantic,
  author      = {Jennifer Chu-Carroll and John Prager and Krzysztof Czuba and David Ferrucci and Pablo Duboue},
  title       = {Semantic search via {XML} fragments: {A} high-precision approach to {IR}},
  booktitle   = {Proc.
SIGIR},
  publisher   = {ACM Press},
  year        = {2006},
  isbn        = {1-59593-369-7},
  pages       = {445--452},
  location    = {Seattle, Washington, USA},
  doi         = {10.1145/1148170.1148247},
  address     = {New York, NY},
}

% Page ranges below use the double hyphen (en dash) form.
@inproceedings{arvola05generalized,
  author      = {Paavo Arvola and Marko Junkkari and Jaana Kek{\"a}l{\"a}inen},
  title       = {Generalized contextualization method for {XML} information retrieval},
  booktitle   = {Proc. CIKM},
  year        = {2005},
  pages       = {20--27},
  ee          = {doi.acm.org/10.1145/1099554.1099561},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}
% publisher = {ACM Press},

@inproceedings{sigurbjornsson04mixture,
  author      = {B{\"o}rkur Sigurbj{\"o}rnsson and Jaap Kamps and Maarten de Rijke},
  title       = {Mixture Models, Overlap, and Structural Hints in {XML} Element Retrieval},
  booktitle   = {Proc. INEX},
  year        = {2004},
  pages       = {196--210},
  ee          = {dx.doi.org/10.1007/11424550_16},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{vittaut06machine,
  author      = {Jean-No{\"e}l Vittaut and Patrick Gallinari},
  title       = {Machine Learning Ranking for Structured Information Retrieval},
  booktitle   = {Proc.
ECIR},
  year        = {2006},
  pages       = {338--349},
  ee          = {dx.doi.org/10.1007/11735106_30},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

% The doi fields below are stored bare (resolver host removed).
@article{lalmas07evaluating,
  author      = {Mounia Lalmas and Anastasios Tombros},
  title       = {Evaluating {XML} retrieval effectiveness at {INEX}},
  publisher   = {ACM Press},
  journal     = {SIGIR Forum},
  volume      = {41},
  number      = {1},
  year        = {2007},
  issn        = {0163-5840},
  pages       = {40--57},
  doi         = {10.1145/1273221.1273225},
  address     = {New York, NY},
}

@article{chaudhuri06probabilistic,
  author      = {Surajit Chaudhuri and Gautam Das and Vagelis Hristidis and Gerhard Weikum},
  title       = {Probabilistic information retrieval approach for ranking of database query results},
  publisher   = {ACM Press},
  journal     = {ACM Transactions on Database Systems},
  volume      = {31},
  number      = {3},
  year        = {2006},
  issn        = {0362-5915},
  pages       = {1134--1168},
  doi         = {10.1145/1166074.1166085},
  address     = {New York, NY},
}

@inproceedings{cohen98integration,
  author      = {William W. Cohen},
  title       = {Integration of Heterogeneous Databases Without Common Domains Using Queries Based on Textual Similarity},
  booktitle   = {Proc. SIGMOD},
  publisher   = {ACM Press},
  year        = {1998},
  isbn        = {0-89791-955-5},
  pages       = {201--212},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

% editor = {Laura M.
% Haas and Ashutosh Tiwary},

@article{navarro97proximal,
  author      = {Gonzalo Navarro and Ricardo Baeza-Yates},
  title       = {Proximal nodes: {A} model to query document databases by content and structure},
  publisher   = {ACM Press},
  journal     = {TOIS},
  volume      = {15},
  number      = {4},
  year        = {1997},
  issn        = {1046-8188},
  pages       = {400--435},
  doi         = {10.1145/263479.263482},
  address     = {New York, NY},
}

@article{fuhr97probabilistic,
  author      = {Norbert Fuhr and Thomas R{\"o}lleke},
  title       = {A probabilistic relational algebra for the integration of information retrieval and database systems},
  publisher   = {ACM Press},
  journal     = {TOIS},
  volume      = {15},
  number      = {1},
  year        = {1997},
  issn        = {1046-8188},
  pages       = {32--66},
  doi         = {10.1145/239041.239045},
  address     = {New York, NY},
}

@article{ameryahia05report,
  author      = {Sihem Amer-Yahia and Pat Case and Thomas R{\"o}lleke and Jayavel Shanmugasundaram and Gerhard Weikum},
  title       = {Report on the {DB/IR} panel at {SIGMOD} 2005},
  publisher   = {ACM Press},
  journal     = {SIGMOD Record},
  volume      = {34},
  number      = {4},
  year        = {2005},
  issn        = {0163-5808},
  pages       = {71--74},
  doi         = {10.1145/1107499.1107514},
  address     = {New York, NY},
}

% NOTE(review): the title below hard-codes italics with {\it k}; kept as found.
@article{theobald08efficient,
  author      = {Martin Theobald and Holger Bast and Debapriyo Majumdar and Ralf Schenkel and Gerhard Weikum},
  title       = {Top{X}: {E}fficient and versatile top-{\it k} query processing for semistructured data},
  journal     = {VLDB Journal},
  volume      = {17},
  number      = {1},
  year        = {2008},
  pages       = {81--115},
  ee          = {dx.doi.org/10.1007/s00778-007-0072-z},
}

@article{ameryahia06xquery,
  author      = {Sihem Amer-Yahia and Chavdar Botev and Jochen D{\"o}rre and Jayavel
Shanmugasundaram},
  title       = {{XQuery} Full-Text extensions explained},
  journal     = {IBM Systems Journal},
  volume      = {45},
  number      = {2},
  pages       = {335--352},
  year        = {2006},
}

% NOTE(review): month holds a free-form date ("May 30th") rather than a month
% macro; kept verbatim so the day is not lost. The trailing period was removed
% from the title because the style supplies the punctuation.
@inproceedings{zavrel00information,
  author      = {Jakub Zavrel and Peter Berck and Willem Lavrijssen},
  title       = {Information Extraction by Text Classification: {C}orpus Mining for Features},
  booktitle   = {Workshop Information Extraction Meets Corpus Linguistics},
  month       = {May 30th},
  year        = {2000},
  address     = {Athens, Greece},
  note        = {Held in conjunction with LREC-2000},
  url         = {www.cnts.ua.ac.be/Publications/2000/ZBL00},
}

@article{zobel95efficient,
  author      = {Justin Zobel and Alistair Moffat and Ross Wilkinson and Ron Sacks-Davis},
  title       = {Efficient retrieval of partial documents},
  journal     = {IP\&M},
  volume      = {31},
  number      = {3},
  year        = {1995},
  issn        = {0306-4573},
  pages       = {361--377},
  doi         = {10.1016/0306-4573(94)00052-5},
  publisher   = {Pergamon Press},
  address     = {Tarrytown, NY},
}

@inproceedings{salton93passage,
  author      = {Gerard Salton and James Allan and Chris Buckley},
  title       = {Approaches to passage retrieval in full text information systems},
  booktitle   = {Proc. SIGIR},
  publisher   = {ACM Press},
  year        = {1993},
  isbn        = {0-89791-605-0},
  pages       = {49--58},
  location    = {Pittsburgh, Pennsylvania, United States},
  doi         = {10.1145/160688.160693},
  address     = {New York, NY},
}

@inproceedings{kaszkiel97passage,
  author      = {Marcin Kaszkiel and Justin Zobel},
  title       = {Passage retrieval revisited},
  booktitle   = {Proc.
SIGIR},
  publisher   = {ACM Press},
  year        = {1997},
  isbn        = {0-89791-836-3},
  pages       = {178--185},
  location    = {Philadelphia, Pennsylvania, United States},
  doi         = {10.1145/258525.258561},
  address     = {New York, NY},
}

% The trailing period was removed from the title below; the style supplies
% the punctuation.
@article{hearst97texttiling,
  author      = {Marti A. Hearst},
  title       = {{TextTiling}: {S}egmenting Text into Multi-paragraph Subtopic Passages},
  journal     = {Computational Linguistics},
  volume      = {23},
  number      = {1},
  year        = {1997},
  pages       = {33--64},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{tan07using,
  author      = {Songbo Tan and Xueqi Cheng},
  title       = {Using hypothesis margin to boost centroid text classifier},
  booktitle   = {Proc. ACM Symposium on Applied Computing},
  publisher   = {ACM Press},
  year        = {2007},
  isbn        = {1-59593-480-4},
  pages       = {398--403},
  location    = {Seoul, Korea},
  doi         = {10.1145/1244002.1244096},
  address     = {New York, NY},
}

@inproceedings{han00centroidbased,
  author      = {Eui-Hong Han and George Karypis},
  title       = {Centroid-Based Document Classification: {A}nalysis and Experimental Results},
  booktitle   = {Proc. PKDD},
  year        = {2000},
  pages       = {424--431},
  ee          = {link.springer.de/link/service/series/0558/bibs/1910/19100424.htm},
  bibsource   = {DBLP, http://dblp.uni-trier.de},
}

% The acronym TFIDF is brace-protected so sentence-casing styles keep it
% in capitals.
@inproceedings{joachims97probabilistic,
  author      = {Thorsten Joachims},
  title       = {A Probabilistic Analysis of the {R}occhio Algorithm with {TFIDF} for Text Categorization},
  booktitle   = {Proc.
ICML},
  year        = {1997},
  isbn        = {1-55860-486-3},
  pages       = {143--151},
  publisher   = {Morgan Kaufmann},
  address     = {San Francisco, CA},
}

@inproceedings{allan98online,
  author      = {James Allan and Ron Papka and Victor Lavrenko},
  title       = {On-line new event detection and tracking},
  booktitle   = {Proc. SIGIR},
  publisher   = {ACM Press},
  year        = {1998},
  isbn        = {1-58113-015-5},
  pages       = {37--45},
  location    = {Melbourne, Australia},
  doi         = {10.1145/290941.290954},
  address     = {New York, NY},
}

@inproceedings{trotman06passage,
  author      = {Andrew Trotman and Shlomo Geva},
  title       = {Passage Retrieval and Other {XML}-Retrieval Tasks},
  booktitle   = {SIGIR 2006 Workshop on {XML} Element Retrieval Methodology},
  pages       = {43--50},
  year        = {2006},
}

@techreport{somogyi90melbourne,
  author      = {Zoltan Somogyi},
  title       = {The {M}elbourne {U}niversity bibliography system},
  year        = {1990},
  institution = {Melbourne University},
  address     = {Parkville, Victoria, Australia},
  number      = {90/3},
}

% NOTE(review): no issue number is recorded for the next entry.
@article{lesk88grab,
  author      = {Michael Lesk},
  title       = {Grab -- {I}nverted indexes with low storage overhead},
  year        = {1988},
  journal     = {Computing Systems},
  volume      = {1},
  pages       = {207--220},
}

@inproceedings{joachims06training,
  author      = {Thorsten Joachims},
  title       = {Training linear {SVMs} in linear time},
  booktitle   = {Proc.
KDD},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-339-5},
  pages = {217--226},
  location = {Philadelphia, PA, USA},
  doi = {doi.acm.org/10.1145/1150402.1150429},
  address = {New York, NY},
}

@Article{ perkins03grafting,
  author = {Simon Perkins and Kevin Lacker and James Theiler},
  title = {Grafting: {F}ast, incremental feature selection by gradient descent in function space},
  journal = {JMLR},
  volume = {3},
  year = {2003},
  issn = {1533-7928},
  pages = {1333--1356},
  publisher = {MIT Press},
  address = {Cambridge, MA},
}

@InProceedings{ fraenkel85novel,
  author = {Aviezri S. Fraenkel and Shmuel T. Klein},
  title = {Novel Compression of sparse Bit-Strings -- Preliminary Report},
  booktitle = {Combinatorial Algorithms on Words, NATO ASI Series Vol F12},
  publisher = {Springer},
  address = {Berlin},
  year = 1985,
  pages = {169--183},
}

@InProceedings{ moffat92parameterised,
  author = {Alistair Moffat and Justin Zobel},
  title = {Parameterised compression for sparse bitmaps},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {1992},
  isbn = {0-89791-523-2},
  pages = {274--285},
  location = {Copenhagen, Denmark},
  doi = {doi.acm.org/10.1145/133160.133210},
  address = {New York, NY},
}

@Book{ pirolli07information,
  title = {Information Foraging Theory: {A}daptive Interaction With Information},
  author = {Peter L. T. Pirolli},
  publisher = {Oxford University Press},
  year = 2007,
}

@Book{ langville,
  title = {Google's {PageRank} and Beyond: {T}he Science of Search Engine Rankings},
  author = {Amy Langville and Carl Meyer},
  publisher = {Princeton University Press},
  year = 2006,
}

@Article{ fraley98how,
  author = {Chris Fraley and Adrian E.
Raftery},
  title = {How Many Clusters? {W}hich Clustering Method? {A}nswers Via Model-Based Cluster Analysis},
  journal = {Computer Journal},
  volume = {41},
  number = {8},
  year = {1998},
  pages = {578--588},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{ iwayama95clusterbased,
  author = {Makoto Iwayama and Takenobu Tokunaga},
  title = {Cluster-Based Text Categorization: {A} Comparison of Category Search Strategies},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {1995},
  isbn = {0-89791-714-6},
  pages = {273--280},
  ee = {db/conf/sigir/IwayamaT95.html},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Article{ bartell98optimizing,
  author = {Brian T. Bartell and Garrison W. Cottrell and Richard K. Belew},
  title = {Optimizing similarity using multi-query relevance feedback},
  journal = {JASIS},
  volume = {49},
  number = {8},
  year = {1998},
  issn = {0002-8231},
  pages = {742--761},
  publisher = {John Wiley \& Sons},
  address = {New York, NY},
}

@Article{ dhillon01concept,
  author = {Inderjit S. Dhillon and Dharmendra S.
Modha},
  title = {Concept decompositions for large sparse text data using clustering},
  journal = {Machine Learning},
  volume = {42},
  number = {1/2},
  year = {2001},
  issn = {0885-6125},
  pages = {143--175},
  doi = {dx.doi.org/10.1023/A:1007612920971},
  publisher = {Kluwer},
  address = {Hingham, MA},
}

@InProceedings{ grabs02xml,
  title = {Generating Vector Spaces On-the-fly for Flexible {XML} Retrieval},
  author = {Torsten Grabs and Hans-J{\"o}rg Schek},
  booktitle = {{XML} and Information Retrieval Workshop at SIGIR 2002},
  year = {2002},
}

@Article{ schlieder02querying,
  author = {Torsten Schlieder and Holger Meuss},
  title = {Querying and ranking {XML} documents},
  journal = {JASIST},
  volume = {53},
  number = {6},
  year = {2002},
  issn = {1532-2882},
  pages = {489--503},
  doi = {dx.doi.org/10.1002/asi.10060},
  publisher = {John Wiley \& Sons},
  address = {New York, NY},
}

@InProceedings{ tannier05xml,
  author = {Xavier Tannier and Shlomo Geva},
  title = {{XML} Retrieval with a Natural Language Interface},
  booktitle = {Proc. SPIRE},
  year = {2005},
  pages = {29--40},
  ee = {dx.doi.org/10.1007/11575832_4},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{ zwol06bricks,
  author = {Roelof {van~Zwol} and Jeroen Baas and Herre van Oostendorp and Frans Wiering},
  title = {Bricks: {T}he Building Blocks to Tackle Query Formulation in Structured Document Retrieval},
  booktitle = {Proc.
ECIR},
  year = {2006},
  pages = {314--325},
  ee = {dx.doi.org/10.1007/11735106_28},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Article{ fuhr04xirql,
  author = {Norbert Fuhr and Kai Gro{\ss}johann},
  title = {{XIRQL}: {A}n {XML} query language based on information retrieval concepts},
  journal = {TOIS},
  volume = {22},
  number = {2},
  year = {2004},
  pages = {313--356},
  url = {doi.acm.org/10.1145/984321.984326},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{ stein03cluster,
  author = {Benno Stein and Sven Meyer zu Eissen and Frank Wi{\ss}brock},
  title = {On Cluster Validity and the Information Need of Users},
  booktitle = {Proc. Artificial Intelligence and Applications},
  year = 2003,
}

@InProceedings{ stein04topic,
  author = {Benno Stein and Sven Meyer zu Eissen},
  title = {Topic Identification: {F}ramework and Application},
  booktitle = {Proc. International Conference on Knowledge Management},
  year = 2004,
}

@PhDThesis{ bartell94optimizing,
  author = {Brian Theodore Bartell},
  title = {Optimizing ranking functions: {A} connectionist approach to adaptive information retrieval},
  year = {1994},
  order_no = {UMI Order No. GAX94-14751},
  school = {University of California at San Diego},
  address = {La Jolla, CA},
}

@Book{ grossman04information,
  title = {Information Retrieval: {A}lgorithms and Heuristics},
  author = {David A. Grossman and Ophir Frieder},
  edition = {2nd},
  publisher = {Springer},
  year = 2004,
}

@InProceedings{ lu07cisr,
  author = {Wei Lu and Stephen E. Robertson and Andrew MacFarlane},
  title = {{CISR} at {INEX} 2006},
  booktitle = {Proc.
INEX},
  year = {2007},
  pages = {57--63},
  crossref = {fuhr07comparative},
}

@InProceedings{ kamps04length,
  author = {Jaap Kamps and Maarten de Rijke and B{\"o}rkur Sigurbj{\"o}rnsson},
  title = {Length normalization in {XML} retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2004},
  isbn = {1-58113-881-4},
  pages = {80--87},
  location = {Sheffield, United Kingdom},
  doi = {doi.acm.org/10.1145/1008992.1009009},
  address = {New York, NY},
}

% Fourth author restored to "Arjen P. de Vries" (the "de" particle was missing).
@Article{ list05tijah,
  author = {Johan List and Vojkan Mihajlovic and Georgina Ram{\'\i}rez and Arjen P. de Vries and Djoerd Hiemstra and Henk Ernst Blok},
  title = {{TIJAH}: {E}mbracing {IR} Methods in {XML} Databases},
  journal = {IR},
  volume = {8},
  number = {4},
  year = {2005},
  issn = {1386-4564},
  pages = {547--570},
  doi = {dx.doi.org/10.1007/s10791-005-0747-2},
  publisher = {Kluwer},
  address = {Hingham, MA, USA},
}

@Article{ larson05fusion,
  author = {Ray R. Larson},
  title = {A Fusion Approach to {XML} Structured Document Retrieval},
  journal = {IR},
  volume = {8},
  number = {4},
  year = {2005},
  pages = {601--629},
  doi = {dx.doi.org/10.1007/s10791-005-0749-0},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Article{ kazai06extended,
  author = {Gabriella Kazai and Mounia Lalmas},
  title = {{eXtended} cumulated gain measures for the evaluation of content-oriented {XML} retrieval},
  journal = {TOIS},
  volume = {24},
  number = {4},
  year = {2006},
  pages = {503--542},
  doi = {doi.acm.org/10.1145/1185883},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{ lalmas07inex,
  author = {Mounia Lalmas and Gabriella Kazai and Jaap Kamps and Jovan Pehcevski and Benjamin Piwowarski and Stephen E.
Robertson},
  title = {{INEX} 2006 evaluation measures},
  crossref = {fuhr07comparative},
  year = {2007},
  pages = {20--34},
}

@Proceedings{ trotman07focused,
  editor = {Andrew Trotman and Shlomo Geva and Jaap Kamps},
  title = {SIGIR Workshop on Focused Retrieval},
  year = {2007},
  publisher = {University of Otago},
  location = {Dunedin, New Zealand},
}

% Crossref parent of the INEX 2006 papers. Classic BibTeX copies missing
% fields verbatim and does not map title to booktitle, so booktitle is given
% explicitly for children that do not carry their own.
@Book{ fuhr07comparative,
  title = "{Comparative Evaluation of {XML} Information Retrieval Systems, 5th International Workshop of the Initiative for the Evaluation of {XML} Retrieval, {INEX} 2006}",
  booktitle = {Proc. INEX},
  year = 2007,
  editor = {Norbert Fuhr and Mounia Lalmas and Andrew Trotman},
  address = {Heidelberg},
  publisher = {Springer},
}

% number = 4518,
% series = {Lecture Notes in Computer Science/Lecture Notes in
% Artificial Intelligence (LNCS/LNAI)},

@InProceedings{ okeefe04simplest,
  author = {Richard A. O'Keefe and Andrew Trotman},
  title = {The simplest query language that could possibly work},
  booktitle = {Proc. INEX},
  year = {2004},
  pages = {167--174},
  crossref = {fuhr04advances},
}

@Article{ ameryahia06xml,
  author = {Sihem Amer-Yahia and Mounia Lalmas},
  title = {{XML} search: {L}anguages, {INEX} and scoring},
  publisher = {ACM Press},
  journal = {SIGMOD Record},
  volume = {35},
  number = {4},
  year = {2006},
  issn = {0163-5808},
  pages = {16--23},
  doi = {doi.acm.org/10.1145/1228268.1228271},
  address = {New York, NY},
}

@InProceedings{ theobald05topx,
  author = {Martin Theobald and Ralf Schenkel and Gerhard Weikum},
  title = {An efficient and versatile query engine for {TopX} search},
  booktitle = {Proc.
VLDB},
  year = {2005},
  isbn = {1-59593-154-6},
  pages = {625--636},
  location = {Trondheim},
  publisher = {VLDB Endowment},
}

@Proceedings{ fuhr03inex2002,
  title = {{INitiative for the Evaluation of {XML}} Retrieval ({INEX}). Proc. First {INEX} Workshop},
  editor = {Norbert Fuhr and Norbert G{\"o}vert and Gabriella Kazai and Mounia Lalmas},
  address = {Sophia Antipolis, France},
  publisher = {ERCIM},
  booktitle = {Proc. INEX 2002},
  entrydate = 20030226,
  month = mar,
  year = 2003,
}

% series = {ERCIM Workshop Proceedings},

@Proceedings{ fuhr03inex,
  title = {{INEX} 2003 Workshop},
  year = {2003},
  editor = {Norbert Fuhr and Saadia Malik and Mounia Lalmas},
  url = {inex.is.informatik.uni-duisburg.de:2003/proceedings.pdf},
}

@Proceedings{ fuhr05advances,
  editor = {Norbert Fuhr and Mounia Lalmas and Saadia Malik and Gabriella Kazai},
  title = {Advances in {XML} Information Retrieval and Evaluation, 4th International Workshop of the Initiative for the Evaluation of {XML} Retrieval, {INEX} 2005},
  booktitle = {Proc.
INEX},
  publisher = {Springer},
  year = {2006},
  isbn = {3-540-34962-6},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% series = {Lecture Notes in Computer Science},
% volume = {3977},

@Article{ kamps06articulating,
  author = {Jaap Kamps and Maarten Marx and Maarten de Rijke and B{\"o}rkur Sigurbj{\"o}rnsson},
  title = {Articulating information needs in {XML} query languages},
  publisher = {ACM Press},
  journal = {TOIS},
  volume = {24},
  number = {4},
  year = {2006},
  issn = {1046-8188},
  pages = {407--436},
  doi = {doi.acm.org/10.1145/1185877.1185879},
  address = {New York, NY},
}

@InProceedings{ mccallum98improving,
  author = {Andrew McCallum and Ronald Rosenfeld and Tom M. Mitchell and Andrew Y. Ng},
  title = {Improving Text Classification by Shrinkage in a Hierarchy of Classes},
  booktitle = {Proc. ICML},
  year = {1998},
  isbn = {1-55860-556-8},
  pages = {359--367},
  publisher = {Morgan Kaufmann},
  address = {San Francisco, CA},
}

@InProceedings{ trotman04narrowed,
  author = {Andrew Trotman and B{\"o}rkur Sigurbj{\"o}rnsson},
  title = {Narrowed {E}xtended {XP}ath {I} ({NEXI})},
  booktitle = {Proc. INEX},
  year = {2004},
  pages = {16--40},
  doi = {dx.doi.org/10.1007/11424550\_2},
  crossref = {fuhr04advances},
}

% Crossref parent. Classic BibTeX only resolves a crossref when the parent
% appears later in the database than the citing entry, so this entry is kept
% below trotman04narrowed.
@Book{ fuhr04advances,
  editor = {Norbert Fuhr and Mounia Lalmas and Saadia Malik and Zolt{\'a}n Szl{\'a}vik},
  title = {Advances in {XML} Information Retrieval, Third International Workshop of the Initiative for the Evaluation of {XML} Retrieval, {INEX} 2004},
  booktitle = {Proc. INEX},
  publisher = {Springer},
  year = {2005},
  isbn = {3-540-26166-4},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% series = {Lecture Notes in Computer Science},
% volume = {3493},

@InProceedings{ fuhr07advances,
  author = {Norbert Fuhr and Mounia Lalmas},
  title = {Advances in {XML} Retrieval: {T}he {INEX} Initiative},
  booktitle = {International Workshop on Research Issues in Digital Libraries},
  year = 2007,
}

@TechReport{singhal95length,
  author = {Amit Singhal and Gerard Salton and Chris Buckley},
  title = {Length Normalization in Degraded Text Collections},
  year = {1995},
  institution = {Cornell University},
  address = {Ithaca, NY},
}

@InProceedings{singhal96length,
  title = {Length Normalization in Degraded Text Collections},
  author = {Amit Singhal and Gerard Salton and Chris Buckley},
  booktitle = {Proc. SDAIR},
  pages = {149--162},
  year = 1996,
}

@InProceedings{bast05spectral,
  author = {Holger Bast and Debapriyo Majumdar},
  title = {Why spectral retrieval works},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2005},
  isbn = {1-59593-034-5},
  pages = {11--18},
  location = {Salvador, Brazil},
  doi = {doi.acm.org/10.1145/1076034.1076040},
  address = {New York, NY},
}

@Article{ moffat95insitu,
  author = {Alistair Moffat and Timothy A. H. Bell},
  title = {In situ generation of compressed inverted files},
  journal = {JASIS},
  volume = {46},
  number = {7},
  year = {1995},
  issn = {0002-8231},
  pages = {537--550},
  publisher = {John Wiley \& Sons},
  address = {New York, NY},
}

@InProceedings{ zukowski06superscalar,
  author = {Marcin Zukowski and Sandor Heman and Niels Nes and Peter Boncz},
  title = {Super-Scalar {RAM-CPU} Cache Compression},
  booktitle = {Proc.
International Conference on Data Engineering},
  year = {2006},
  isbn = {0-7695-2570-9},
  pages = {59},
  doi = {dx.doi.org/10.1109/ICDE.2006.150},
  publisher = {IEEE Computer Society},
  address = {Washington, DC, USA},
}

@InProceedings{ dom02information,
  author = {Byron E. Dom},
  title = {An Information-Theoretic External Cluster-Validity Measure},
  booktitle = {Proc. UAI},
  month = aug,
  year = {2002},
}

@InProceedings{ blanco07boosting,
  title = {Boosting Static Pruning of Inverted Files},
  author = {Roi Blanco and Alvaro Barreiro},
  year = 2007,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
}

@Article{ weiss05concept,
  author = "Stanis{\l}aw Osi{\'n}ski and Dawid Weiss",
  title = {A Concept-Driven Algorithm for Clustering Search Results},
  journal = {IEEE Intelligent Systems},
  number = "3",
  volume = "20",
  pages = "48--54",
  year = "2005",
}

@inproceedings{ arthur06worstcase,
  author = {David Arthur and Sergei Vassilvitskii},
  title = {How slow is the {\it k}-means method?},
  booktitle = {Proc. ACM Symposium on Computational Geometry},
  year = {2006},
  pages = {144--153},
  ee = {doi.acm.org/10.1145/1137856.1137880},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}

@Book{ fellbaum98wordnet,
  title = {WordNet -- An Electronic Lexical Database},
  author = {Christiane D. Fellbaum},
  publisher = {MIT Press},
  year = {1998},
}

@Article{ fowlkes83clusterings,
  author = "Edward B. Fowlkes and Colin L.
Mallows",
  title = "A Method for Comparing Two Hierarchical Clusterings",
  journal = {Journal of the American Statistical Association},
  volume = "78",
  year = "1983",
  number = "383",
  pages = "553--569",
  url = {www.jstor.org/view/01621459/di985957/98p0926l/0},
}

@InProceedings{ kleinberg02impossibility,
  title = {An Impossibility Theorem for Clustering},
  author = {Jon M. Kleinberg},
  year = {2002},
  booktitle = {Proc. NIPS},
}

% The page range is not recorded for this paper; the empty pages field was
% dropped rather than left as a blank value.
@InProceedings{ meila05clusterings,
  author = "Marina Meil{\u{a}}",
  title = "Comparing clusterings -- {A}n axiomatic view",
  booktitle = {Proc. ICML},
  year = "2005",
  address = "Bonn",
}

@Article{ savaresi04pddp,
  author = {Sergio M. Savaresi and Daniel Boley},
  title = {A comparative analysis on the bisecting {K}-means and the {PDDP} clustering algorithms},
  journal = {Intelligent Data Analysis},
  volume = {8},
  number = {4},
  year = {2004},
  pages = {345--362},
}

@Article{ castro04likelihood,
  author = {R. M. Castro and M. J. Coates and R. D. Nowak},
  title = {Likelihood Based Hierarchical Clustering},
  journal = {IEEE Transactions on Signal Processing},
  volume = 52,
  number = 8,
  year = {2004},
  pages = {2308--2321},
}

@InProceedings{ kamvar02interpreting,
  author = {Sepandar D. Kamvar and Dan Klein and Christopher D. Manning},
  title = {Interpreting and Extending Classical Agglomerative Clustering Algorithms using a Model-Based approach},
  booktitle = {Proc. ICML},
  year = {2002},
  isbn = {1-55860-873-7},
  pages = {283--290},
  publisher = {Morgan Kaufmann},
  address = {San Francisco, CA},
}

@Book{ mclachlan96em,
  title = {The {EM} Algorithm and Extensions},
  author = {Geoffrey J.
McLachlan and Thiriyambakam Krishnan},
  year = 1996,
  publisher = {John Wiley \& Sons},
}

@Article{ blei03latent,
  author = {David M. Blei and Andrew Y. Ng and Michael I. Jordan},
  title = {Latent {D}irichlet allocation},
  journal = {JMLR},
  volume = {3},
  year = {2003},
  issn = {1533-7928},
  pages = {993--1022},
  publisher = {MIT Press},
  address = {Cambridge, MA, USA},
}

@Book{ rice06statistics,
  author = {John A. Rice},
  title = {Mathematical Statistics and Data Analysis},
  publisher = {Duxbury Press},
  year = 2006,
}

@Book{ sheldon06probability,
  author = {Sheldon Ross},
  title = {A First Course in Probability},
  publisher = {Pearson Prentice Hall},
  year = 2006,
}

@InProceedings{ buttcher06document,
  author = {Stefan B{\"u}ttcher and Charles L. A. Clarke},
  title = {A document-centric approach to static index pruning in text retrieval systems},
  booktitle = {Proc. CIKM},
  year = {2006},
  isbn = {1-59593-433-2},
  pages = {182--189},
  doi = {doi.acm.org/10.1145/1183614.1183644},
}
% publisher = {ACM Press},
% location = {Arlington, VA},
% address = {New York, NY},

@Article{ trotman03compressing,
  author = {Andrew Trotman},
  title = {Compressing Inverted Files},
  journal = {IR},
  volume = {6},
  number = {1},
  year = {2003},
  issn = {1386-4564},
  pages = {5--19},
  doi = {dx.doi.org/10.1023/A:1022949613039},
  publisher = {Kluwer},
  address = {Hingham, MA},
}

@Book{ cover91elements,
  author = {Thomas M. Cover and Joy A.
Thomas},
  title = {Elements of Information Theory},
  publisher = {Wiley},
  year = {1991},
  address = {New York},
}

@Article{ barroso03web,
  author = {Luiz Andr{\'e} Barroso and Jeffrey Dean and Urs H{\"o}lzle},
  title = {Web Search for a Planet: {T}he {G}oogle Cluster Architecture},
  journal = {IEEE Micro},
  volume = {23},
  number = {2},
  year = {2003},
  issn = {0272-1732},
  pages = {22--28},
  doi = {dx.doi.org/10.1109/MM.2003.1196112},
  publisher = {IEEE Computer Society Press},
  address = {Los Alamitos, CA},
}

@Book{ comtet74advanced,
  author = {Louis Comtet},
  publisher = {Reidel},
  title = {Advanced Combinatorics},
  year = {1974},
}

@InProceedings{ ball65data,
  author = {G. H. Ball},
  title = {Data analysis in the social sciences: {W}hat about the details?},
  booktitle = {Proc. Fall Joint Computer Conference},
  publisher = {Spartan Books},
  pages = {533--560},
  year = 1965,
}

@Book{ burnham02model,
  author = {Kenneth P. Burnham and David Anderson},
  citeulike-article-id = {157697},
  isbn = {0387953647},
  publisher = {Springer},
  title = {Model Selection and Multi-Model Inference},
  year = {2002},
}

@Article{ hartigan79kmeans,
  author = {J. A. Hartigan and M. A. Wong},
  title = {A {K}-Means Clustering Algorithm},
  journal = {Applied Statistics},
  volume = 28,
  pages = {100--108},
  entrydate = 20030618,
  key = {Hartigan/Wong:79},
  year = 1979,
}

@InProceedings{ basu04active,
  title = {Active Semi-Supervision for Pairwise Constrained Clustering},
  address = {Lake Buena Vista, FL},
  author = {Sugato Basu and Arindam Banerjee and Raymond J. Mooney},
  booktitle = {Proc.
SIAM International Conference on Data Mining},
  pages = {333--344},
  year = {2004},
}

@InProceedings{ huang06text,
  author = {Yifen Huang and Tom M. Mitchell},
  title = {Text clustering with extended user feedback},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {413--420},
  location = {Seattle, WA},
  doi = {doi.acm.org/10.1145/1148170.1148242},
  address = {New York, NY},
}

@InProceedings{ crouch88cluster,
  author = {Carolyn J. Crouch},
  title = {A cluster-based approach to thesaurus construction},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {1988},
  isbn = {2-7061-0309-4},
  pages = {309--320},
  location = {Grenoble},
  doi = {doi.acm.org/10.1145/62437.62467},
  address = {New York, NY},
}

@InProceedings{ schuetze95information,
  author = {Hinrich Sch{\"u}tze and Jan O. Pedersen},
  title = {Information Retrieval Based on Word Senses},
  year = 1995,
  booktitle = {Proc. SDAIR},
  address = {Las Vegas, NV},
  pages = {161--175},
}

@Book{ witten05data,
  title = {Data Mining: {P}ractical Machine Learning Tools and Techniques},
  author = {Ian H.
Witten and Eibe Frank},
  edition = {2nd},
  howpublished = {Paperback},
  month = jun,
  publisher = {Morgan Kaufmann},
  year = {2005},
  isbn = {0120884070},
  citeulike-article-id = {340715},
  priority = {0},
  keywords = {weka data mining da },
}

% series = {Morgan Kaufmann Series in Data Management Sys},

@InCollection{ cheeseman96bayesian,
  author = {Peter Cheeseman and John Stutz},
  title = {Bayesian Classification ({AutoClass}): {T}heory and Results},
  booktitle = {Advances in Knowledge Discovery and Data Mining},
  year = {1996},
  pages = {153--180},
  publisher = {MIT Press},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Unpublished{ mccallum96bow,
  author = "Andrew Kachites McCallum",
  title = {Bow: {A} toolkit for statistical language modeling, text retrieval, classification and clustering},
  note = "\url{www.cs.cmu.edu/~mccallum/bow}",
  year = 1996,
}

% Accents are written as brace groups ({\^\i}, {\c{c}}) so classic BibTeX
% treats each as a single letter when sorting and building labels.
@InProceedings{ picca06nonlinear,
  author = {Davide Picca and Beno{\^\i}t Curdy and Fran{\c{c}}ois Bavaud},
  title = {Non-linear correspondence analysis in text retrieval: {A} kernel view},
  booktitle = {Proc. JADT},
  year = {2006},
}

@Book{ bishop06pattern,
  author = {Christopher M. Bishop},
  title = {Pattern Recognition and Machine Learning},
  publisher = {Springer},
  year = 2006,
}

@InProceedings{ ghamrawi05collective,
  author = {Nadia Ghamrawi and Andrew McCallum},
  title = {Collective multi-label classification},
  booktitle = {Proc.
CIKM},
  publisher = {ACM Press},
  year = {2005},
  isbn = {1-59593-140-6},
  pages = {195--200},
  location = {Bremen},
  doi = {doi.acm.org/10.1145/1099554.1099591},
  address = {New York, NY},
}

@Article{ geman92neural,
  author = {Stuart Geman and Elie Bienenstock and Ren{\'e} Doursat},
  title = {Neural networks and the bias/variance dilemma},
  journal = {Neural Computation},
  volume = {4},
  number = {1},
  year = {1992},
  issn = {0899-7667},
  pages = {1--58},
  publisher = {MIT Press},
  address = {Cambridge, MA},
}

@InProceedings{ anagnostopoulos06effective,
  author = {Aris Anagnostopoulos and Andrei Z. Broder and Kunal Punera},
  title = {Effective and efficient classification on a search-engine model},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-433-2},
  pages = {208--217},
  location = {Arlington, VA},
  doi = {doi.acm.org/10.1145/1183614.1183648},
  address = {New York, NY},
}

% The month was recorded only as a "????" placeholder, which styles would
% print literally; the field is omitted until the real month is confirmed.
@Article{ rahm01survey,
  author = "Erhard Rahm and Philip A. Bernstein",
  title = "A survey of approaches to automatic schema matching",
  journal = {VLDB Journal},
  volume = "10",
  number = "4",
  pages = "334--350",
  year = "2001",
  url = "citeseer.ist.psu.edu/rahm01survey.html",
}

@InProceedings{ hatzivassiloglou00linguistic,
  author = {Vasileios Hatzivassiloglou and Luis Gravano and Ankineedu Maganti},
  title = {An investigation of linguistic features and clustering algorithms for topical document clustering},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2000},
  isbn = {1-58113-226-3},
  pages = {224--231},
  location = {Athens},
  doi = {doi.acm.org/10.1145/345508.345582},
  address = {New York, NY},
}

@Article{ lance67general,
  author = "G. N.
Lance and W. T. Williams",
  title = "A general theory of classificatory sorting strategies 1. {Hierarchical} systems",
  journal = {Computer Journal},
  volume = "9",
  number = "4",
  pages = "373--380",
  month = feb,
  year = "1967",
  coden = "CMPJA6",
  issn = "0010-4620",
}

@InProceedings{ sahoo06incremental,
  author = {Nachiketa Sahoo and Jamie Callan and Ramayya Krishnan and George Duncan and Rema Padman},
  title = {Incremental hierarchical clustering of text documents},
  booktitle = {Proc. CIKM},
  year = {2006},
  isbn = {1-59593-433-2},
  pages = {357--366},
  doi = {doi.acm.org/10.1145/1183614.1183667},
}
% publisher = {ACM Press},
% location = {Arlington, VA},
% address = {New York, NY},

@InProceedings{ larsen99fast,
  author = {Bjornar Larsen and Chinatsu Aone},
  title = {Fast and effective text mining using linear-time document clustering},
  booktitle = {Proc. KDD},
  publisher = {ACM Press},
  year = {1999},
  isbn = {1-58113-143-7},
  pages = {16--22},
  location = {San Diego, CA},
  doi = {doi.acm.org/10.1145/312129.312186},
  address = {New York, NY},
}

@InProceedings{ zhao02evaluation,
  author = {Ying Zhao and George Karypis},
  title = {Evaluation of hierarchical clustering algorithms for document datasets},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  year = {2002},
  isbn = {1-58113-492-4},
  pages = {515--524},
  location = {McLean, VA},
  doi = {doi.acm.org/10.1145/584792.584877},
  address = {New York, NY},
}

@InProceedings{ buttcher05indexing,
  author = {Stefan B{\"u}ttcher and Charles L. A. Clarke},
  title = {Indexing time vs. query time: {T}rade-offs in dynamic information retrieval systems},
  booktitle = {Proc.
CIKM},
  publisher = {ACM Press},
  year = {2005},
  isbn = {1-59593-140-6},
  pages = {317--318},
  location = {Bremen},
  doi = {doi.acm.org/10.1145/1099554.1099645},
  address = {New York, NY},
}

@InProceedings{ forman06tackling,
  author = {George Forman},
  title = {Tackling concept drift by temporal inductive transfer},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {252--259},
  location = {Seattle, WA},
  doi = {doi.acm.org/10.1145/1148170.1148216},
  address = {New York, NY},
}

@Article{ brisaboa06lightweight,
  title = {Lightweight Natural Language Text Compression},
  author = {Nieves R. Brisaboa and Antonio Fari{\~n}a and Gonzalo Navarro and Jos{\'e} R. Param{\'a}},
  journal = {IR},
  year = 2007,
  volume = 10,
  number = 1,
  pages = {1--33},
}

@InProceedings{ buttcher05security,
  author = {Stefan B{\"u}ttcher and Charles L. A. Clarke},
  title = {A Security Model for Full-Text File System Search in Multi-User Environments},
  booktitle = {Proc.
FAST},
  year = {2005},
  url = {www.usenix.org/events/fast05/tech/buettcher.html},
}

@Book{ heaps78information,
  author = {Heaps, Harold S.},
  year = {1978},
  title = {Information Retrieval: {C}omputational and Theoretical Aspects},
  publisher = {Academic Press},
  address = {New York},
}

@Article{ anh06improved,
  author = {Vo Ngoc Anh and Alistair Moffat},
  title = "Improved Word-Aligned Binary Compression for Text Indexing",
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  year = 2006,
  volume = 18,
  number = 6,
  pages = "857--861",
}

@InProceedings{ buckley94automatic,
  author = "Chris Buckley and James Allan and Gerard Salton",
  title = "Automatic Routing and Ad-hoc Retrieval using {SMART}: {TREC} 2",
  booktitle = {Proc. TREC},
  pages = "45--55",
  year = 1994,
}

@InProceedings{ schapire98boosting,
  author = {Robert E. Schapire and Yoram Singer and Amit Singhal},
  title = {Boosting and {R}occhio Applied to Text Filtering},
  year = 1998,
  pages = {215--223},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
}

% The publisher and editor fields were present but empty; they are omitted
% here because BibTeX treats an empty value as a missing field anyway.
@InProceedings{ ittner95text,
  author = {David J. Ittner and David D. Lewis and David D. Ahn},
  title = {Text categorization of low quality images},
  booktitle = {Proc. SDAIR},
  year = {1995},
  address = {Las Vegas, US},
  pages = {301--315},
}

@InProceedings{ alonso06gio,
  author = {Omar Alonso and Sandeepan Banerjee and Mark Drake},
  title = {{GIO}: {A} semantic web application using the information grid framework},
  booktitle = {Proc.
WWW},
  publisher = {ACM Press},
  year      = {2006},
  isbn      = {1-59593-323-9},
  pages     = {857--858},
  location  = {Edinburgh},
  doi       = {doi.acm.org/10.1145/1135777.1135913},
  address   = {New York, NY},
}

@InProceedings{toda05search,
  author    = {Hiroyuki Toda and Ryoji Kataoka},
  title     = {A search result clustering method using informatively named entities},
  booktitle = {International Workshop on Web Information and Data Management},
  publisher = {ACM Press},
  year      = {2005},
  isbn      = {1-59593-194-5},
  pages     = {81--86},
  location  = {Bremen},
  doi       = {doi.acm.org/10.1145/1097047.1097063},
  address   = {New York, NY},
}

@InProceedings{ogilvie05parameter,
  author    = {Paul Ogilvie and Jamie Callan},
  title     = {Parameter Estimation for a Simple Hierarchical Generative Model for {XML} Retrieval},
  booktitle = {Proc. INEX},
  year      = {2005},
  pages     = {211--224},
  doi       = {dx.doi.org/10.1007/11766278\_16},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Article{witten90source,
  author    = {Ian H. Witten and Timothy C. Bell},
  title     = {Source models for natural language text},
  journal   = {International Journal of Man-Machine Studies},
  volume    = {32},
  number    = {5},
  year      = {1990},
  issn      = {0020-7373},
  pages     = {545--579},
  publisher = {Academic Press},
  address   = {London, UK},
}

@InProceedings{kleinberg97two,
  author    = {Jon M. Kleinberg},
  title     = {Two algorithms for nearest-neighbor search in high dimensions},
  booktitle = {Proc.
ACM Symposium on Theory of Computing},
  publisher = {ACM Press},
  year      = {1997},
  isbn      = {0-89791-888-6},
  pages     = {599--608},
  location  = {El Paso, TX},
  doi       = {doi.acm.org/10.1145/258533.258653},
  address   = {New York, NY},
}

@InProceedings{anh06structured,
  title     = {Structured Index Organizations for High-Throughput Text Querying},
  booktitle = {Proc. SPIRE},
  author    = {Vo Ngoc Anh and Alistair Moffat},
  publisher = {Springer},
  pages     = {304--315},
  year      = 2006,
}

% Disabled fields, presumably for anh06structured (the entry just above);
% text outside an entry is ignored by BibTeX:
%   series = {Lecture Notes in Computer Science},
%   volume = 4209,

@InProceedings{koenemann96interaction,
  author    = {J{\"u}rgen Koenemann and Nicholas J. Belkin},
  title     = {A case for interaction: {A} study of interactive information retrieval behavior and effectiveness},
  booktitle = {Proc. SIGCHI},
  publisher = {ACM Press},
  year      = {1996},
  isbn      = {0-89791-777-4},
  pages     = {205--212},
  location  = {Vancouver},
  doi       = {doi.acm.org/10.1145/238386.238487},
  address   = {New York, NY},
}

@Article{dieugenio04kappa,
  author    = {Barbara {Di Eugenio} and Michael Glass},
  title     = {The Kappa Statistic: {A} Second Look},
  journal   = {Computational Linguistics},
  volume    = {30},
  number    = {1},
  year      = {2004},
  pages     = {95--101},
  doi       = {dx.doi.org/10.1162/089120104773633402},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{singitham04efficiency,
  author    = {Pavan Kumar C. Singitham and Mahathi S. Mahabhashyam and Prabhakar Raghavan},
  title     = {Efficiency-Quality Tradeoffs for Vector Score Aggregation},
  booktitle = {Proc. VLDB},
  year      = {2004},
  pages     = {624--635},
  url       = {www.vldb.org/conf/2004/RS17P1.PDF},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{buttcher06hybrid,
  author    = {Stefan B{\"u}ttcher and Charles L. A.
Clarke and Brad Lushman},
  title     = {Hybrid index maintenance for growing text collections},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year      = {2006},
  isbn      = {1-59593-369-7},
  pages     = {356--363},
  location  = {Seattle, WA},
  doi       = {doi.acm.org/10.1145/1148170.1148233},
  address   = {New York, NY},
}

@Article{heinz02burst,
  author    = {Steffen Heinz and Justin Zobel and Hugh E. Williams},
  title     = {Burst tries: {A} fast, efficient data structure for string keys},
  publisher = {ACM Press},
  journal   = {TOIS},
  volume    = {20},
  number    = {2},
  year      = {2002},
  issn      = {1046-8188},
  pages     = {192--223},
  doi       = {doi.acm.org/10.1145/506309.506312},
  address   = {New York, NY},
}

@InProceedings{ribeiro99efficient,
  author    = {Berthier Ribeiro-Neto and Edleno S. Moura and Marden S. Neubert and Nivio Ziviani},
  title     = {Efficient distributed algorithms to build inverted files},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year      = {1999},
  isbn      = {1-58113-096-1},
  pages     = {105--112},
  location  = {Berkeley, CA},
  doi       = {doi.acm.org/10.1145/312624.312663},
  address   = {New York, NY},
}

@InProceedings{melnik01building,
  author    = {Sergey Melnik and Sriram Raghavan and Beverly Yang and Hector Garcia-Molina},
  title     = {Building a distributed full-text index for the Web},
  booktitle = {Proc. WWW},
  publisher = {ACM Press},
  year      = {2001},
  isbn      = {1-58113-348-0},
  pages     = {396--406},
  location  = {Hong Kong},
  doi       = {doi.acm.org/10.1145/371920.372095},
  address   = {New York, NY},
}

@Article{lester06efficient,
  author    = {Nicholas Lester and Justin Zobel and Hugh E.
Williams},
  title     = {Efficient online index maintenance for contiguous inverted lists},
  journal   = {IP\&M},
  volume    = {42},
  number    = {4},
  year      = {2006},
  pages     = {916--933},
  doi       = {dx.doi.org/10.1016/j.ipm.2005.09.005},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Article{williams05searchable,
  author    = {Hugh E. Williams and Justin Zobel},
  title     = {Searchable words on the Web},
  journal   = {International Journal on Digital Libraries},
  volume    = {5},
  number    = {2},
  year      = {2005},
  pages     = {99--105},
  doi       = {dx.doi.org/10.1007/s00799-003-0050-z},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Article{heinz03efficient,
  author    = {Steffen Heinz and Justin Zobel},
  title     = {Efficient single-pass index construction for text databases},
  journal   = {JASIST},
  volume    = {54},
  number    = {8},
  year      = {2003},
  issn      = {1532-2882},
  pages     = {713--729},
  doi       = {dx.doi.org/10.1002/asi.10268},
  publisher = {John Wiley \& Sons},
  address   = {New York, NY},
}

@InProceedings{lester05fast,
  author    = {Nicholas Lester and Alistair Moffat and Justin Zobel},
  title     = {Fast on-line index construction by geometric partitioning},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  year      = {2005},
  isbn      = {1-59593-140-6},
  pages     = {776--783},
  location  = {Bremen},
  doi       = {doi.acm.org/10.1145/1099554.1099739},
  address   = {New York, NY},
}

@InProceedings{treeratpituk06experimental,
  author    = {Pucktada Treeratpituk and Jamie Callan},
  title     = {An experimental study on automatically labeling hierarchical clusters using statistical features},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  year      = {2006},
  isbn      = {1-59593-369-7},
  pages     = {707--708},
  location  = {Seattle, WA},
  doi       = {doi.acm.org/10.1145/1148170.1148328},
  address   = {New York, NY},
}

@Manual{r05r,
  title     = {R: {A} language and environment for statistical computing},
  author    = {{R Development Core Team}},
  organization = {R Foundation for Statistical Computing},
  address   = {Vienna},
  year      = {2005},
  note      = {{ISBN} 3-900051-07-0},
  url       = {www.R-project.org},
}

@Article{tombros02effectiveness,
  author    = {Anastasios Tombros and Robert Villa and Cornelis Joost {van~Rijsbergen}},
  title     = {The effectiveness of query-specific hierarchic clustering in information retrieval},
  journal   = {IP\&M},
  volume    = {38},
  number    = {4},
  year      = {2002},
  issn      = {0306-4573},
  pages     = {559--582},
  doi       = {dx.doi.org/10.1016/S0306-4573(01)00048-6},
  publisher = {Pergamon Press},
  address   = {Tarrytown, NY},
}

@Article{schwarz78estimating,
  author    = {Gideon Schwarz},
  title     = {Estimating the dimension of a model},
  journal   = {Annals of Statistics},
  year      = {1978},
  volume    = {6},
  number    = 2,
  pages     = {461--464},
}

@InProceedings{pelleg00xmeans,
  year      = {2000},
  pages     = {727--734},
  publisher = {Morgan Kaufmann},
  address   = {San Francisco},
  booktitle = {Proc.
ICML},
  author    = {Dan Pelleg and Andrew Moore},
  title     = {X-means: {E}xtending K-means with Efficient Estimation of the Number of Clusters},
}

@Article{akaike74new,
  author    = {Hirotugu Akaike},
  title     = {A new look at the statistical model identification},
  journal   = {{IEEE} Transactions on Automatic Control},
  year      = {1974},
  volume    = {19},
  number    = 6,
  pages     = {716--723},
}

@Article{tibshirani01estimating,
  author    = {Robert Tibshirani and Guenther Walther and Trevor Hastie},
  title     = {Estimating the number of clusters in a data set via the gap statistic},
  journal   = {Journal of the Royal Statistical Society Series~B},
  volume    = 63,
  year      = 2001,
  pages     = {411--423},
}

@InProceedings{bradley98scaling,
  author    = {Paul S. Bradley and Usama M. Fayyad and Cory Reina},
  title     = {Scaling Clustering Algorithms to Large Databases},
  booktitle = {Proc. KDD},
  year      = {1998},
  pages     = {9--15},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{fayyad98initialization,
  author    = {Usama M. Fayyad and Cory Reina and Paul S. Bradley},
  title     = {Initialization of Iterative Refinement Clustering Algorithms},
  booktitle = {Proc. KDD},
  year      = {1998},
  pages     = {194--198},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{macqueen67some,
  author    = {James B. MacQueen},
  title     = {Some methods for classification and analysis of multivariate observations},
  booktitle = {Proc. Berkeley Symposium on Mathematics, Statistics and Probability},
  pages     = {281--297},
  year      = {1967},
  publisher = {University of California Press},
}

% Disabled field, presumably for macqueen67some (the entry just above):
%   volume = 1,

@Article{lloyd82least,
  author    = {Stuart P.
Lloyd},
  title     = {Least squares quantization in {PCM}},
  journal   = {IEEE Transactions on Information Theory},
  volume    = {28},
  number    = {2},
  year      = {1982},
  pages     = {129--136},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{ji06document,
  author    = {Xiang Ji and Wei Xu},
  title     = {Document clustering with prior knowledge},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year      = {2006},
  isbn      = {1-59593-369-7},
  pages     = {405--412},
  location  = {Seattle, WA},
  doi       = {doi.acm.org/10.1145/1148170.1148241},
  address   = {New York, NY},
}

@PhDThesis{strehl02relationship,
  author    = {Alexander Strehl},
  title     = {Relationship-based Clustering and Cluster Ensembles for High-dimensional Data Mining},
  year      = {2002},
  month     = may,
  school    = {The University of Texas at Austin},
}

@InProceedings{yang06near,
  author    = {Hui Yang and Jamie Callan},
  title     = {Near-duplicate detection by instance-level constrained clustering},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year      = {2006},
  isbn      = {1-59593-369-7},
  pages     = {421--428},
  location  = {Seattle, WA},
  doi       = {doi.acm.org/10.1145/1148170.1148243},
  address   = {New York, NY},
}

@Book{salton75dynamic,
  author    = {Gerard Salton},
  title     = {Dynamic information and library processing},
  year      = {1975},
  isbn      = {0132213257},
  publisher = {Prentice Hall},
  address   = {Upper Saddle River, NJ},
}

@InProceedings{liu04cluster,
  author    = {Xiaoyong Liu and W. Bruce Croft},
  title     = {Cluster-based retrieval using language models},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  year      = {2004},
  isbn      = {1-58113-881-4},
  pages     = {186--193},
  location  = {Sheffield},
  doi       = {doi.acm.org/10.1145/1008992.1009026},
  address   = {New York, NY},
}

@Article{hearst06clustering,
  author    = {Marti A. Hearst},
  title     = {Clustering versus faceted categories for information exploration},
  publisher = {ACM Press},
  journal   = {CACM},
  volume    = {49},
  number    = {4},
  year      = {2006},
  issn      = {0001-0782},
  pages     = {59--61},
  doi       = {doi.acm.org/10.1145/1121949.1121983},
  address   = {New York, NY},
}

@InProceedings{zamir99grouper,
  author    = {Oren Zamir and Oren Etzioni},
  title     = {Grouper: {A} dynamic clustering interface to Web search results},
  booktitle = {Proc. WWW},
  year      = {1999},
  pages     = {1361--1374},
  location  = {Toronto},
  doi       = {dx.doi.org/10.1016/S1389-1286(99)00054-7},
  publisher = {Elsevier North-Holland},
  address   = {New York, NY},
}

@Article{hubert85comparing,
  author    = {Lawrence Hubert and Phipps Arabie},
  journal   = {Journal of Classification},
  pages     = {193--218},
  title     = {Comparing partitions},
  volume    = {2},
  year      = {1985},
}

@Article{rand71objective,
  author    = {William M. Rand},
  journal   = {Journal of the American Statistical Association},
  pages     = {846--850},
  title     = {Objective criteria for the evaluation of clustering methods},
  volume    = {66},
  number    = 336,
  year      = {1971},
}

@InProceedings{hamerly03kmeans,
  author    = {Greg Hamerly and Charles Elkan},
  title     = {Learning the $k$ in $k$-means},
  booktitle = {Proc.
NIPS},
  year      = {2003},
  url       = {books.nips.cc/papers/files/nips16/NIPS2003\_AA36.pdf},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{vaithyanathan00modelbased,
  author    = {Shivakumar Vaithyanathan and Byron Dom},
  title     = {Model-Based Hierarchical Clustering},
  booktitle = {Proc. UAI},
  year      = {2000},
  isbn      = {1-55860-709-9},
  pages     = {599--608},
  publisher = {Morgan Kaufmann},
  address   = {San Francisco, CA},
}

@InProceedings{lewis96training,
  author    = {David D. Lewis and Robert E. Schapire and James P. Callan and Ron Papka},
  title     = {Training algorithms for linear text classifiers},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year      = {1996},
  isbn      = {0-89791-792-8},
  pages     = {298--306},
  location  = {Zurich},
  doi       = {doi.acm.org/10.1145/243199.243277},
  address   = {New York, NY},
}

@Article{dietterich95multiclass,
  author    = {Thomas G. Dietterich and Ghulum Bakiri},
  title     = {Solving Multiclass Learning Problems via Error-Correcting Output Codes},
  journal   = {Journal of Artificial Intelligence Research},
  volume    = {2},
  year      = {1995},
  pages     = {263--286},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{kaki05findex,
  author    = {Mika K{\"a}ki},
  title     = {Findex: {S}earch result categories help users when document ranking fails},
  booktitle = {Proc. SIGCHI},
  publisher = {ACM Press},
  year      = {2005},
  isbn      = {1-58113-998-5},
  pages     = {131--140},
  doi       = {doi.acm.org/10.1145/1054972.1054991},
  address   = {New York, NY},
}

@Article{allwein00reducing,
  author    = {Erin L. Allwein and Robert E.
Schapire and Yoram Singer},
  title     = {Reducing Multiclass to Binary: {A} Unifying Approach for Margin Classifiers},
  journal   = {JMLR},
  volume    = {1},
  year      = {2000},
  pages     = {113--141},
  url       = {www.jmlr.org/papers/volume1/allwein00a/allwein00a.pdf},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{lewis98naive,
  author    = {David D. Lewis},
  title     = {Naive ({B}ayes) at Forty: {T}he Independence Assumption in Information Retrieval},
  booktitle = {Proc. ECML},
  year      = {1998},
  isbn      = {3-540-64417-2},
  pages     = {4--15},
  publisher = {Springer},
  address   = {London, UK},
}

@InProceedings{ng01discriminative,
  author    = {Andrew Y. Ng and Michael I. Jordan},
  title     = {On Discriminative vs. Generative Classifiers: {A} comparison of logistic regression and naive {B}ayes},
  booktitle = {Proc. NIPS},
  year      = {2001},
  pages     = {841--848},
  url       = {www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/AA28.ps.gz},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Book{snedecor89,
  title     = {Statistical methods},
  author    = {George Waddel Snedecor and William G. Cochran},
  year      = 1989,
  publisher = {Iowa State University Press},
}

@Book{harold04xml,
  author    = {Elliotte Rusty Harold and Scott W. Means},
  howpublished = {Paperback},
  isbn      = {0596007647},
  keywords  = {xml},
  month     = oct,
  publisher = {O'Reilly},
  title     = {{XML} in a Nutshell},
  edition   = {3rd},
  year      = {2004},
}

@InProceedings{mass02juruxml,
  author    = {Yosi Mass and Matan Mandelbrod and Einat Amitay and David Carmel and Yo{\"e}lle S. Maarek and Aya Soffer},
  title     = {{JuruXML} -- {A}n {XML} Retrieval System at {INEX'02}},
  booktitle = {Proc.
INEX},
  year      = {2003},
  pages     = {73--80},
  crossref  = {fuhr03inex},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% govert03overview has no booktitle of its own; presumably it inherits one
% from its crossref target fuhr03inex (not visible in this part of the file).
@InProceedings{govert03overview,
  author    = {Norbert G{\"o}vert and Gabriella Kazai},
  title     = {Overview of the {INitiative for the Evaluation of {XML}} retrieval ({INEX}) 2002},
  pages     = {1--17},
  year      = {2003},
  crossref  = {fuhr03inex},
  entrydate = 20030226,
  key       = {Goevert/Kazai:03},
}

@InProceedings{carmel03fragments,
  author    = {David Carmel and Yo{\"e}lle S. Maarek and Matan Mandelbrod and Yosi Mass and Aya Soffer},
  title     = {Searching {XML} documents via {XML} fragments},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year      = {2003},
  isbn      = {1-58113-646-3},
  pages     = {151--158},
  location  = {Toronto},
  doi       = {doi.acm.org/10.1145/860435.860464},
  address   = {New York, NY},
}

@InProceedings{anh06pruned,
  author    = {Vo Ngoc Anh and Alistair Moffat},
  title     = {Pruned query evaluation using pre-computed impacts},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year      = {2006},
  isbn      = {1-59593-369-7},
  pages     = {372--379},
  location  = {Seattle, WA},
  doi       = {doi.acm.org/10.1145/1148170.1148235},
  address   = {New York, NY},
}

@Article{moura00fast,
  author    = {de Moura, Edleno Silva and Gonzalo Navarro and Nivio Ziviani and Ricardo Baeza-Yates},
  title     = {Fast and flexible word searching on compressed text},
  publisher = {ACM Press},
  journal   = {TOIS},
  volume    = {18},
  number    = {2},
  year      = {2000},
  issn      = {1046-8188},
  pages     = {113--139},
  doi       = {doi.acm.org/10.1145/348751.348754},
  address   = {New York, NY},
}

@InProceedings{persin94filtered,
  author    = {Michael Persin},
  title     = {Document filtering for fast ranking},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  year      = {1994},
  pages     = {339--348},
  address   = {New York, NY},
}

@Article{persin96filtered,
  author    = {Michael Persin and Justin Zobel and Ron Sacks-Davis},
  title     = {Filtered document retrieval with frequency-sorted indexes},
  journal   = {JASIS},
  volume    = {47},
  number    = {10},
  year      = {1996},
  issn      = {0002-8231},
  pages     = {749--764},
  publisher = {John Wiley \& Sons},
  address   = {New York, NY},
}

@InProceedings{scholer02inverted,
  author    = {Falk Scholer and Hugh E. Williams and John Yiannis and Justin Zobel},
  title     = {Compression of inverted indexes for fast query evaluation},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year      = {2002},
  isbn      = {1-58113-561-0},
  pages     = {222--229},
  location  = {Tampere, Finland},
  doi       = {doi.acm.org/10.1145/564376.564416},
  address   = {New York, NY},
}

@Article{anh05invertedindex,
  author    = {Vo Ngoc Anh and Alistair Moffat},
  title     = {Inverted Index Compression Using Word-Aligned Binary Codes},
  journal   = {IR},
  volume    = {8},
  number    = {1},
  year      = {2005},
  issn      = {1386-4564},
  pages     = {151--166},
  doi       = {dx.doi.org/10.1023/B:INRT.0000048490.99518.5c},
  publisher = {Kluwer},
  address   = {Hingham, MA},
}

@Article{moffat96selfindexing,
  author    = {Alistair Moffat and Justin Zobel},
  title     = {Self-indexing inverted files for fast text retrieval},
  publisher = {ACM Press},
  journal   = {TOIS},
  volume    = {14},
  number    = {4},
  year      = {1996},
  issn      = {1046-8188},
  pages     = {349--379},
  address   = {New York, NY},
}

@InProceedings{anh01termination,
  author    = {Vo Ngoc Anh and Owen de Kretser and Alistair Moffat},
  title     = {Vector-space ranking with effective early termination},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  year      = {2001},
  isbn      = {1-58113-331-6},
  pages     = {35--42},
  location  = {New Orleans, LA},
  address   = {New York, NY},
}

@InProceedings{dean04mapreduce,
  author    = {Jeffrey Dean and Sanjay Ghemawat},
  title     = {{MapReduce}: {S}implified Data Processing on Large Clusters},
  booktitle = {Proc. Symposium on Operating Systems Design and Implementation},
  address   = {San Francisco, CA},
  year      = 2004,
}

@Article{harman90retrieving,
  author    = {Donna Harman and Gerald Candela},
  title     = {Retrieving records from a gigabyte of text on a minicomputer using statistical ranking},
  journal   = {JASIS},
  volume    = 41,
  number    = 8,
  year      = 1990,
  pages     = {581--589},
}

@InProceedings{steinbach00comparison,
  author    = {Michael Steinbach and George Karypis and Vipin Kumar},
  title     = {A comparison of document clustering techniques},
  booktitle = {KDD Workshop on Text Mining},
  year      = {2000},
}

@InProceedings{glover02structure,
  author    = {Eric J. Glover and Kostas Tsioutsiouliklis and Steve Lawrence and David M. Pennock and Gary W. Flake},
  title     = {Using web structure for classifying and describing web pages},
  booktitle = {Proc. WWW},
  publisher = {ACM Press},
  year      = {2002},
  isbn      = {1-58113-449-5},
  pages     = {562--569},
  location  = {Honolulu, HI},
  doi       = {doi.acm.org/10.1145/511446.511520},
  address   = {New York, NY},
}

@Article{jain99data,
  author    = {Anil Jain and M. Narasimha Murty and Patrick Flynn},
  title     = {Data clustering: {A} review},
  publisher = {ACM Press},
  journal   = {ACM Computing Surveys},
  volume    = {31},
  number    = {3},
  year      = {1999},
  issn      = {0360-0300},
  pages     = {264--323},
  address   = {New York, NY},
}

@Article{ward63hierarchical,
  author    = {J. H.
{Ward Jr.}},
  title     = {Hierarchical grouping to optimize an objective function},
  journal   = {Journal of the American Statistical Association},
  volume    = 58,
  pages     = {236--244},
  year      = 1963,
}

@InProceedings{elhamdouchi86hierarchic,
  author    = {Abdelmoula El-Hamdouchi and Peter Willett},
  title     = {Hierarchic document classification using {Ward}'s clustering method},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year      = {1986},
  isbn      = {0-89791-187-3},
  pages     = {149--156},
  location  = {Pisa},
  doi       = {doi.acm.org/10.1145/253168.253200},
  address   = {New York, NY},
}

@Article{murtagh83survey,
  author    = {Fionn Murtagh},
  title     = {A Survey of Recent Advances in Hierarchical Clustering Algorithms},
  journal   = {Computer Journal},
  volume    = {26},
  number    = {4},
  year      = {1983},
  pages     = {354--359},
}

@Book{cormen90algorithms,
  author    = {Thomas H. Cormen and Charles Eric Leiserson and Ronald L. Rivest},
  title     = {Introduction to Algorithms},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  pages     = {xvii + 1028},
  year      = {1990},
  isbn      = {0-262-03141-8, 0-07-013143-0 (McGraw Hill)},
  isbn-13   = {978-0-262-03141-7, 978-0-07-013143-9 (McGraw Hill)},
  lccn      = {QA76.6 .C662 1990},
}

@Article{day84efficient,
  author    = {William H. Day and Herbert Edelsbrunner},
  year      = 1984,
  title     = {Efficient Algorithms for Agglomerative Hierarchical Clustering Methods},
  journal   = {Journal of Classification},
  volume    = 1,
  pages     = {1--24},
}

@Article{king67stepwise,
  author    = {Benjamin King},
  title     = {Step-wise clustering procedures},
  journal   = {Journal of the American Statistical Association},
  volume    = {62},
  year      = {1967},
  pages     = {86--101},
}

@Book{sneath73numerical,
  author    = "Peter H. A.
Sneath and Robert R. Sokal",
  title     = "Numerical Taxonomy: {T}he Principles and Practice of Numerical Classification",
  publisher = "W. H. Freeman",
  address   = "San Francisco",
  year      = 1973,
  isbn      = "0-7167-0697-0",
}

@TechReport{voorhees85effectiveness,
  author    = {Ellen M. Voorhees},
  title     = {The Effectiveness and Efficiency of Agglomerative Hierarchic Clustering in Document Retrieval},
  institution = {Cornell},
  year      = 1985,
  number    = {TR 85-705},
}

@Unpublished{popescul00automatic,
  title     = {Automatic Labeling of Document Clusters},
  author    = {Alexandrin Popescul and Lyle H. Ungar},
  note      = {Unpublished \textsc{ms}, U. Pennsylvania},
  year      = 2000,
  url       = {http://www.cis.upenn.edu/~popescul/Publications/popescul00labeling.pdf},
}

@InProceedings{mckeown95generating,
  author    = {Kathleen McKeown and Dragomir R. Radev},
  title     = {Generating summaries of multiple news articles},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year      = {1995},
  isbn      = {0-89791-714-6},
  pages     = {74--82},
  location  = {Seattle, WA},
  doi       = {doi.acm.org/10.1145/215206.215334},
  address   = {New York, NY},
}

@InProceedings{glover02inferring,
  author    = {Eric Glover and David M. Pennock and Steve Lawrence and Robert Krovetz},
  title     = {Inferring hierarchical descriptions},
  booktitle = {Proc.
CIKM},
  publisher = {ACM Press},
  year      = {2002},
  isbn      = {1-58113-492-4},
  pages     = {507--514},
  location  = {McLean, VA},
  doi       = {doi.acm.org/10.1145/584792.584876},
  address   = {New York, NY},
}

@InCollection{darrell06locality,
  title     = {Locality-sensitive hashing using stable distributions},
  booktitle = {Nearest Neighbor Methods in Learning and Vision: {T}heory and Practice},
  author    = {Alexandr Andoni and Mayur Datar and Nicole Immorlica and Piotr Indyk and Vahab Mirrokni},
  editor    = {T. Darrell and P. Indyk and G. Shakhnarovich},
  publisher = {MIT Press},
  year      = 2006,
}

@Article{cover67nearest,
  author    = {Thomas M. Cover and Peter E. Hart},
  title     = {Nearest neighbor pattern classification},
  journal   = {IEEE Transactions on Information Theory},
  volume    = {13},
  number    = {1},
  year      = {1967},
  pages     = {21--27},
}

@InProceedings{yang94expert,
  author    = {Yiming Yang},
  title     = {Expert network: {E}ffective and efficient learning from human decisions in text categorization and retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages     = {13--22},
  year      = 1994,
}

@InProceedings{turtle94boolean,
  author    = {Howard Turtle},
  year      = {1994},
  title     = {Natural language vs. {B}oolean query evaluation: {A} comparison of retrieval performance},
  pages     = {212--220},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
}

@InProceedings{lita03truecasing,
  author    = {Lucian Vlad Lita and Abe Ittycheriah and Salim Roukos and Nanda Kambhatla},
  title     = {{tRuEcasIng}},
  booktitle = {Proc.
ACL},
  year      = 2003,
  pages     = {152--159},
}

@InProceedings{boldi05skiplists,
  author    = {Paolo Boldi and Sebastiano Vigna},
  title     = {Compressed perfect embedded skip lists for quick inverted-index lookups},
  booktitle = {Proc. SPIRE},
  publisher = {Springer},
  year      = 2005,
}

% Disabled field, presumably for boldi05skiplists (the entry just above):
%   series = {Lecture Notes in Computer Science},

@Book{manning99foundations,
  author    = {Christopher D. Manning and Hinrich Sch{\"u}tze},
  title     = {Foundations of Statistical Natural Language Processing},
  year      = 1999,
  address   = {Cambridge, MA},
  publisher = {MIT Press},
}

@InProceedings{bahle02phrase,
  author    = {Dirk Bahle and Hugh E. Williams and Justin Zobel},
  year      = 2002,
  title     = {Efficient Phrase Querying with an Auxiliary Index},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages     = {215--221},
}

@Article{williams04phrase,
  author    = {Hugh E. Williams and Justin Zobel and Dirk Bahle},
  year      = 2004,
  title     = {Fast Phrase Querying With Combined Indexes},
  journal   = {TOIS},
  volume    = {22},
  number    = {4},
  pages     = {573--594},
}

@InProceedings{sproat03bakeoff,
  author    = {Richard Sproat and Thomas Emerson},
  year      = 2003,
  title     = {The First International {C}hinese Word Segmentation Bakeoff},
  booktitle = {SIGHAN Workshop on Chinese Language Processing},
}

@Book{witten99gigabytes,
  author    = {Ian H. Witten and Alistair Moffat and Timothy C.
Bell},
  title     = {Managing Gigabytes: {C}ompressing and Indexing Documents and Images},
  publisher = {Morgan Kaufmann},
  address   = {San Francisco, CA},
  year      = 1999,
  edition   = {2nd},
}

@InProceedings{mccallum98comparison,
  author    = {Andrew McCallum and Kamal Nigam},
  title     = {A Comparison of Event Models for {N}aive {B}ayes Text Classification},
  year      = 1998,
  booktitle = {AAAI/ICML Workshop on Learning for Text Categorization},
  pages     = {41--48},
}

@Article{friedman97bias,
  author    = {Jerome H. Friedman},
  title     = {On Bias, Variance, 0/1--Loss, and the Curse-of-Dimensionality},
  journal   = {Data Mining and Knowledge Discovery},
  year      = {1997},
  volume    = {1},
  number    = {1},
  pages     = {55--77},
  annote    = {Also, Technical Report, Stanford University, 1996},
}

@Book{duda00pattern,
  author    = {Richard O. Duda and Peter E. Hart and David G. Stork},
  title     = {Pattern Classification},
  edition   = {2nd},
  year      = {2000},
  isbn      = {0471056693},
  publisher = {Wiley-Interscience},
}

@InProceedings{croft78cluster,
  author    = {W. Bruce Croft},
  title     = {A file organization for cluster-based retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year      = {1978},
  pages     = {65--82},
  address   = {New York, NY},
}

@Article{sebastiani02automated,
  author    = {Fabrizio Sebastiani},
  title     = {Machine Learning in Automated Text Categorization},
  journal   = {ACM Computing Surveys},
  volume    = 34,
  number    = 1,
  pages     = {1--47},
  year      = 2002,
}

@InProceedings{yang97selection,
  author    = {Yiming Yang and Jan Pedersen},
  title     = {Feature selection in statistical learning of text categorization},
  booktitle = {Proc.
ICML},
  year = 1997,
}

% Page range uses an en dash (--); the trailing period was dropped from the
% title because styles add their own punctuation.
@InProceedings{ li03loss,
  author = {Fan Li and Yiming Yang},
  title = {A Loss Function Analysis for Classification Methods in Text Categorization},
  booktitle = {Proc. ICML},
  year = {2003},
  pages = {472--479},
}

@Book{ voorhees05experiment,
  editor = {Ellen M. Voorhees and Donna Harman},
  title = {{TREC}: {E}xperiment and Evaluation in Information Retrieval},
  publisher = {MIT Press},
  year = 2005,
}

@Article{ elias75universal,
  author = {Peter Elias},
  title = {Universal Code word sets and representations of the integers},
  journal = {IEEE Transactions on Information Theory},
  volume = 21,
  number = 2,
  year = 1975,
  pages = {194--203},
}

@Article{ domingos97optimality,
  author = {Pedro Domingos and Michael J. Pazzani},
  title = {On the Optimality of the Simple {B}ayesian Classifier under Zero-One Loss},
  journal = {Machine Learning},
  volume = {29},
  number = {2-3},
  pages = {103--130},
  year = {1997},
  url = {citeseer.ist.psu.edu/domingos97optimality.html},
}

@Article{ zhang01text,
  author = {Tong Zhang and Frank J. Oles},
  title = {Text Categorization Based on Regularized Linear Classification Methods},
  journal = {IR},
  volume = {4},
  number = {1},
  publisher = {Kluwer},
  pages = {5--31},
  year = {2001},
  url = {citeseer.ist.psu.edu/zhang00text.html},
}

@Article{ lewis04benchmark,
  author = {David D. Lewis and Yiming Yang and Tony G.
Rose and Fan Li},
  title = {{RCV1}: {A} New Benchmark Collection for Text Categorization Research},
  journal = {JMLR},
  volume = {5},
  year = {2004},
  issn = {1533-7928},
  pages = {361--397},
  publisher = {MIT Press},
}

@Book{ joachims2002classify,
  author = {Thorsten Joachims},
  title = {Learning to Classify Text Using Support Vector Machines},
  publisher = {Kluwer},
  year = 2002,
}

@Book{ hastie2001elements,
  title = {The Elements of Statistical Learning: {D}ata Mining, Inference, and Prediction},
  author = {Trevor Hastie and Robert Tibshirani and Jerome H. Friedman},
  publisher = {Springer},
  address = {New York},
  year = 2001,
}

@Book{ korfhage97,
  title = {Information Storage and Retrieval},
  author = {Robert R. Korfhage},
  year = 1997,
  publisher = {Wiley},
}

@Book{ baezayates99,
  title = {Modern Information Retrieval},
  author = {Ricardo Baeza-Yates and Berthier Ribeiro-Neto},
  publisher = {Addison Wesley},
  address = {Harlow},
  year = 1999,
}

@Book{ chakrabarti02,
  author = {Soumen Chakrabarti},
  title = {Mining the Web: {A}nalysis of Hypertext and Semi Structured Data},
  publisher = {Morgan Kaufmann},
  year = {2002},
}

@InCollection{ rocchio71,
  author = {J. J.
Rocchio},
  title = {Relevance feedback in information retrieval},
  crossref = {salton71smart},
  year = 1971,
  pages = {313--323},
}

@InCollection{ salton71cluster,
  author = {Gerard Salton},
  title = {Cluster search strategies and the optimization of retrieval effectiveness},
  crossref = {salton71smart},
  year = 1971,
  pages = {223--242},
}

% Parent of the crossref entries above; booktitle repeats title so that
% children inherit it under classic BibTeX.
@Book{ salton71smart,
  editor = {Gerard Salton},
  title = {The {SMART} Retrieval System -- Experiments in Automatic Document Processing},
  booktitle = {The {SMART} Retrieval System -- Experiments in Automatic Document Processing},
  publisher = {Prentice Hall},
  address = {Englewood Cliffs, NJ},
  year = 1971,
}

@Book{ zipf49human,
  author = {George Kingsley Zipf},
  year = 1949,
  title = {Human Behavior and the Principle of Least Effort},
  publisher = {Addison Wesley},
  address = {Cambridge, MA},
}

@Article{ dunning93accurate,
  author = {Ted Dunning},
  title = {Accurate Methods for the Statistics of Surprise and Coincidence},
  year = 1993,
  journal = {Computational Linguistics},
  volume = 19,
  number = 1,
  pages = {61--74},
}

@Article{ jardine71hierarchic,
  author = {N. Jardine and Cornelis Joost {van~Rijsbergen}},
  title = {The use of hierarchic clustering in information retrieval},
  journal = {Information Storage and Retrieval},
  volume = 7,
  pages = {217--240},
  year = 1971,
}

@InProceedings{ singhal97,
  author = {Amit Singhal and Mandar Mitra and Chris Buckley},
  title = {Learning Routing Queries in a Query Zone},
  year = 1997,
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  pages = {25--32},
}

@InProceedings{schutze97projections,
  author = {Hinrich Sch{\"u}tze and Craig Silverstein},
  title = {Projections for Efficient Document Clustering},
  year = 1997,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {74--81},
}

@InProceedings{ voorhees85,
  author = {Ellen M. Voorhees},
  title = {The cluster hypothesis revisited},
  year = 1985,
  pages = {188--196},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
}

@InProceedings{ hp96,
  author = {Marti A. Hearst and Jan O. Pedersen},
  title = {Reexamining the Cluster Hypothesis},
  year = 1996,
  pages = {76--84},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  address = {Zurich},
}

@InProceedings{ dum-95,
  author = {Susan T. Dumais},
  title = {Latent Semantic Indexing ({LSI}): {TREC}-3 Report},
  pages = {219--230},
  year = 1995,
  booktitle = {Proc. TREC},
}

@inproceedings{buckley95new,
  author = {Chris Buckley and Amit Singhal and Mandar Mitra},
  title = {New Retrieval Approaches Using {SMART}: {TREC} 4},
  booktitle = {Proc. TREC},
  year = {1995},
  ee = {trec.nist.gov/pubs/trec4/papers/Cornell_trec4.ps.gz},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{vanrijsbergen89towards,
  author = {Cornelis Joost {van~Rijsbergen}},
  title = {Towards an information logic},
  booktitle = {Proc.
SIGIR},
  year = {1989},
  isbn = {0-89791-321-3},
  pages = {77--86},
  location = {Cambridge, Massachusetts, United States},
  doi = {10.1145/75334.75344},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@Book{ rij79,
  author = {Cornelis Joost {van~Rijsbergen}},
  title = {Information Retrieval},
  year = 1979,
  edition = {2nd},
  address = {London},
  publisher = {Butterworths},
}

@Book{ roget,
  author = {P. M. Roget},
  title = {Roget's International Thesaurus},
  publisher = {Thomas Y. Crowell},
  year = {1946},
  address = {New York},
}

@InProceedings{ ckp93,
  author = {Douglas R. Cutting and David R. Karger and Jan O. Pedersen},
  title = {Constant Interaction-Time {S}catter/{G}ather Browsing of Very Large Document Collections},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 1993,
  pages = {126--134},
}

@InProceedings{ cutting92scattergather,
  author = {Douglas R. Cutting and Jan O. Pedersen and David Karger and John W. Tukey},
  title = {{Scatter/Gather}: {A} Cluster-based Approach to Browsing Large Document Collections},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 1992,
  pages = {318--329},
}

% Initials are separated by a space so BibTeX sees two given-name tokens.
@InProceedings{ qf93,
  author = {Yonggang Qiu and H. P. Frei},
  title = {Concept Based Query Expansion},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 1993,
  pages = {160--169},
}

@Article{ turtle91,
  author = {Howard Turtle and W. Bruce Croft},
  title = {Evaluation of an Inference Network-Based Retrieval Model},
  year = 1991,
  journal = {TOIS},
  volume = 9,
  number = 3,
  pages = {187--222},
}

@InProceedings{ turtle89,
  author = {Howard Turtle and W.
Bruce Croft},
  title = {Inference networks for document retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {1--24},
  year = 1989,
}

@InProceedings{ dumais93,
  author = {Susan T. Dumais},
  title = {Latent Semantic Indexing ({LSI}) and {TREC-2}},
  booktitle = {Proc. TREC},
  pages = {105--115},
  year = 1993,
}

@Article{ dee90,
  author = {Scott Deerwester and Susan T. Dumais and George W. Furnas and Thomas K. Landauer and Richard Harshman},
  year = 1990,
  title = {Indexing by latent semantic analysis},
  journal = {JASIS},
  volume = 41,
  number = 6,
  pages = {391--407},
}

@Article{ robertson76relevance,
  author = {Stephen E. Robertson and Karen Sp{\"a}rck Jones},
  year = 1976,
  title = {Relevance Weighting of Search Terms},
  journal = {JASIS},
  volume = 27,
  pages = {129--146},
}

@Article{ spa72,
  author = {Sp{\"a}rck Jones, Karen},
  year = 1972,
  title = {A statistical interpretation of term specificity and its application in retrieval},
  journal = {Journal of Documentation},
  volume = 28,
  number = 1,
  pages = {11--21},
}

@Article{ sb90,
  author = {Gerard Salton and Chris Buckley},
  year = 1990,
  title = {Improving Retrieval Performance by Relevance Feedback},
  journal = {JASIS},
  volume = 41,
  number = 4,
  pages = {288--297},
}

% Initials are separated by a space so BibTeX sees two given-name tokens.
@Article{ dlr77,
  author = {A. P. Dempster and N. M. Laird and D. B. Rubin},
  year = 1977,
  title = {Maximum likelihood from incomplete data via the {EM} algorithm},
  journal = {Journal of the Royal Statistical Society Series~B},
  volume = 39,
  pages = {1--38},
}

@Book{ jain88algorithms,
  author = {Anil K. Jain and Richard C.
Dubes},
  title = {Algorithms for Clustering Data},
  address = {Englewood Cliffs, NJ},
  publisher = {Prentice Hall},
  year = 1988,
}

@InProceedings{ shp95,
  author = {Hinrich Sch{\"u}tze and David A. Hull and Jan O. Pedersen},
  title = {A Comparison of Classifiers and Document Representations for the Routing Problem},
  year = 1995,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {229--237},
}

@InProceedings{ kupiec95,
  author = {Julian Kupiec and Jan Pedersen and Francine Chen},
  title = {A Trainable Document Summarizer},
  year = 1995,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {68--73},
}

@InProceedings{ lewis95,
  author = {David D. Lewis},
  title = {Evaluating and Optimizing Autonomous Text Classification Systems},
  year = 1995,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
}

@article{fuhr94b,
  author = {Norbert Fuhr and Ulrich Pfeifer},
  title = {Probabilistic information retrieval as a combination of abstraction, inductive learning, and probabilistic assumptions},
  journal = {TOIS},
  volume = {12},
  number = {1},
  year = {1994},
  issn = {1046-8188},
  pages = {92--115},
  doi = {10.1145/174608.174612},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@InProceedings{ccg94,
  author = {William S. Cooper and Aitao Chen and Fredric C. Gey},
  title = {Full Text Retrieval based on Probabilistic Equations with Coefficients fitted by Logistic Regression},
  year = 1994,
  pages = {57--66},
  booktitle = {Proc.
TREC},
}

@Article{ fuhr89,
  author = {Norbert Fuhr},
  title = {Optimum Polynomial Retrieval Functions Based on the Probability Ranking Principle},
  journal = {TOIS},
  volume = 7,
  number = 3,
  pages = {183--204},
  year = 1989,
}

@Article{ saracevic88users,
  author = {Saracevic, Tefko and Kantor, Paul},
  year = 1988,
  title = {A study of information seeking and retrieving. {II}: {U}sers, questions and effectiveness},
  journal = {JASIS},
  volume = 39,
  pages = {177--196},
}

% Year corrected from 1996 to 1988: this is part III of the series above,
% in the same JASIS volume 39, and the citation key also says 88.
@Article{ saracevic88,
  author = {Tefko Saracevic and Paul Kantor},
  title = {A Study of Information Seeking and Retrieving. {III}: Searchers, Searches, Overlap},
  journal = {JASIS},
  volume = 39,
  number = 3,
  pages = {197--216},
  year = 1988,
}

@InProceedings{ hull93using,
  author = {David Hull},
  title = {Using Statistical Testing in the Evaluation of Retrieval Performance},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {329--338},
  year = 1993,
}

@Article{ schuetze98,
  author = {Hinrich Sch{\"u}tze},
  title = {Automatic Word Sense Discrimination},
  year = 1998,
  journal = {Computational Linguistics},
  volume = 24,
  number = 1,
  pages = {97--124},
}

@Book{ mitchell97machine,
  author = {Tom M. Mitchell},
  title = {Machine Learning},
  publisher = {McGraw Hill},
  address = {New York},
  year = 1997,
}

@Article{ croftharper79,
  author = {W. Bruce Croft and David J.
Harper},
  year = 1979,
  title = {Using Probabilistic Models of Document Retrieval Without Relevance Information},
  journal = {Journal of Documentation},
  volume = 35,
  number = 4,
  pages = {285--295},
}

@Article{ moffatzobel98,
  author = {Alistair Moffat and Justin Zobel},
  title = {Exploring the Similarity Space},
  journal = {SIGIR Forum},
  volume = 32,
  number = 1,
  year = 1998,
}

@InCollection{ rasmussen92,
  author = {Edie Rasmussen},
  year = 1992,
  title = {Clustering Algorithms},
  pages = {419--442},
  crossref = {frakes92information},
}

% booktitle repeats title so that chapters citing this book through crossref
% (rasmussen92 above) inherit it under classic BibTeX, which copies fields by
% name and does not map title to booktitle.
@book{ frakes92information,
  editor = {William B. Frakes and Ricardo Baeza-Yates},
  year = 1992,
  title = {Information Retrieval: {D}ata Structures and Algorithms},
  booktitle = {Information Retrieval: {D}ata Structures and Algorithms},
  publisher = {Prentice Hall},
  address = {Englewood Cliffs, NJ},
}

@Book{ kaufman90finding,
  title = {Finding groups in data},
  year = 1990,
  address = {New York},
  author = {Leonard Kaufman and Peter J. Rousseeuw},
  publisher = {Wiley},
}

@Article{ yang99evaluation,
  author = {Yiming Yang},
  title = {An Evaluation of Statistical Approaches to Text Categorization},
  year = 1999,
  journal = {IR},
  volume = {1},
  pages = {69--90},
}

@InProceedings{ yang99re-examination,
  author = {Yiming Yang and Xin Liu},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  title = {A re-examination of text categorization methods},
  year = 1999,
  pages = {42--49},
}

% Minsky and Papert wrote this monograph, so they are listed as authors
% rather than editors.
@Book{ minskypapert88,
  author = {Marvin Lee Minsky and Seymour Papert},
  title = {Perceptrons: {A}n introduction to computational geometry},
  address = {Cambridge, MA},
  publisher = {MIT Press},
  year = 1988,
  note = {Expanded edition.},
}

@InProceedings{ lewis94comparison,
  author = {David D.
Lewis and Marc Ringuette},
  title = {A comparison of two learning algorithms for text categorization},
  booktitle = {Proc. SDAIR},
  year = {1994},
  address = {Las Vegas, NV},
  pages = {81--93},
}

@Article{ berrydumais95,
  author = {Michael W. Berry and Susan T. Dumais and Gavin W. O'Brien},
  journal = {SIAM Review},
  volume = 37,
  number = 4,
  pages = {573--595},
  title = {Using linear algebra for intelligent information retrieval},
  year = {1995},
}

% Issue number corrected from 1 to 3 (NOTE(review): from the published
% record, TOIS 12(3) -- please confirm against the journal).
@Article{ apte94automated,
  title = {Automated Learning of Decision Rules for Text Categorization},
  author = {Chidanand Apt{\'e} and Fred Damerau and Sholom M. Weiss},
  journal = {TOIS},
  volume = 12,
  number = 3,
  year = 1994,
  pages = {233--251},
}

@Article{ sproat96segmentation,
  author = {Richard Sproat and William Gale and Chilin Shih and Nancy Chang},
  title = {A stochastic finite-state word-segmentation algorithm for {C}hinese},
  journal = {Computational Linguistics},
  year = 1996,
  pages = {377--404},
  volume = 22,
  number = 3,
}

@InProceedings{ tseng05segmentation,
  author = {Huihsin Tseng and Pichuan Chang and Galen Andrew and Daniel Jurafsky and Christopher Manning},
  year = 2005,
  title = {A Conditional Random Field Word Segmenter},
  booktitle = {SIGHAN Workshop on Chinese Language Processing},
}

@Book{ lunde98cjkv,
  author = {Ken Lunde},
  title = {{CJKV} Information Processing},
  year = 1998,
  publisher = {O'Reilly},
}

@Article{ porter80stripping,
  author = {Martin F.
Porter},
  title = {An algorithm for suffix stripping},
  year = 1980,
  journal = {Program},
  volume = 14,
  number = 3,
  pages = {130--137},
}

% Journal name completed (NOTE(review): the journal is, to my knowledge,
% "Mechanical Translation and Computational Linguistics" -- please confirm).
@Article{ lovins68stemming,
  author = {Julie Beth Lovins},
  title = {Development of a stemming algorithm},
  year = 1968,
  journal = {Mechanical Translation and Computational Linguistics},
  volume = 11,
  number = 1,
  pages = {22--31},
}

@Article{ paice90stemmer,
  author = {Paice, Chris D.},
  title = {Another Stemmer},
  journal = {SIGIR Forum},
  volume = 24,
  number = 3,
  pages = {56--61},
  year = 1990,
}

@Article{ hull96stemming,
  author = {David Hull},
  title = {Stemming algorithms -- {A} case study for detailed evaluation},
  journal = {JASIS},
  volume = 47,
  number = 1,
  pages = {70--84},
  year = 1996,
}

@Book{ salton89automatic,
  author = {Gerard Salton},
  title = {Automatic Text Processing: {T}he Transformation, Analysis, and Retrieval of Information by Computer},
  publisher = {Addison Wesley},
  address = {Reading, MA},
  year = {1989},
}

@Article{ harman91suffixing,
  author = {Donna Harman},
  year = 1991,
  title = {How effective is suffixing?},
  journal = {JASIS},
  volume = 42,
  pages = {7--15},
}

@PhDThesis{ krovetz95disambiguation,
  author = {Bob Krovetz},
  year = 1995,
  title = {Word sense disambiguation for large text databases},
  school = {University of Massachusetts Amherst},
}

@InProceedings{ cleverdon91cranfield,
  author = {Cyril W. Cleverdon},
  title = {The significance of the {C}ranfield tests on index languages},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  pages = {3--12},
  year = 1991,
}

@InProceedings{ allan05hard,
  author = {James Allan},
  title = {{HARD} Track Overview in {TREC} 2005: {H}igh Accuracy Retrieval from Documents},
  year = 2005,
  booktitle = {Proc. TREC},
}

@Article{ carletta96kappa,
  author = {Jean Carletta},
  year = 1996,
  title = {Assessing Agreement on Classification Tasks: {T}he Kappa Statistic},
  journal = {Computational Linguistics},
  volume = {22},
  pages = {249--254},
}

@Book{ krippendorff03content,
  author = {Klaus Krippendorff},
  title = {Content Analysis: {A}n Introduction to its Methodology},
  year = 2003,
  publisher = {Sage},
}

@Article{ lombard02content,
  author = {Matthew Lombard and Cheryl C. Bracken and Jennifer Snyder-Duch},
  year = 2002,
  title = {Content analysis in mass communication: {A}ssessment and reporting of intercoder reliability},
  journal = {Human Communication Research},
  volume = 28,
  pages = {587--604},
}

@InProceedings{ hersh94ohsumed,
  author = {William Hersh and Chris Buckley and T. J. Leone and David Hickam},
  title = {{OHSUMED}: {A}n interactive retrieval evaluation and new large test collection for research},
  booktitle = {Proc. SIGIR},
  year = {1994},
  isbn = {0-387-19889-X},
  pages = {192--201},
  location = {Dublin, Ireland},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@InProceedings{ carbonell98mmr,
  author = {Jaime Carbonell and Jade Goldstein},
  title = {The use of {MMR}, diversity-based reranking for reordering documents and producing summaries},
  booktitle = {Proc.
SIGIR},
  year = {1998},
  isbn = {1-58113-015-5},
  pages = {335--336},
  location = {Melbourne, Australia},
  doi = {10.1145/290941.291025},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@Article{ kekalainen05relevance,
  author = {Jaana Kek{\"a}l{\"a}inen},
  title = {Binary and graded relevance in {IR} evaluations -- {C}omparison of the effects on ranking of {IR} systems},
  journal = {IP\&M},
  volume = 41,
  year = 2005,
  pages = {1019--1033},
}

% Year corrected from 1998 to 1996 to match JASIS volume 47 (hull96stemming
% in this file is in the same volume). The citation key is left unchanged so
% existing citations keep working. NOTE(review): please confirm.
@Article{ harter98relevance,
  author = {Stephen P. Harter},
  title = {Variations in relevance assessments and the measurement of retrieval effectiveness},
  journal = {JASIS},
  volume = {47},
  year = 1996,
  pages = {37--49},
}

@InProceedings{ barzilay97chains,
  author = {Regina Barzilay and Michael Elhadad},
  title = {Using Lexical Chains for Text Summarization},
  booktitle = {Workshop on Intelligent Scalable Text Summarization},
  year = 1997,
  pages = {10--17},
}

@InProceedings{ jing00reduction,
  author = {Hongyan Jing},
  title = {Sentence reduction for automatic text summarization},
  booktitle = {Proc. Conference on Applied Natural Language Processing},
  year = 2000,
  pages = {310--315},
}

@Misc{ fallows04internet,
  author = {Deborah Fallows},
  year = {2004},
  title = {The Internet and Daily Life},
  note = {Pew/Internet and {A}merican Life Project},
  url = {www.pewinternet.org/pdfs/PIP\_Internet\_and\_Daily\_Life.pdf},
}

@InProceedings{ newsam01image,
  author = {Shawn Newsam and Sitaram Bhagavathy and B. S. Manjunath},
  title = {Category-based image retrieval},
  booktitle = {Proc.
IEEE International Conference on Image Processing, Special Session on Multimedia Indexing, Browsing and Retrieval},
  pages = {596--599},
  year = 2001,
}

% volume = {3},

@InProceedings{ salton91panel,
  author = {Gerard Salton},
  title = {The {S}mart Project in Automatic Document Retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 1991,
  pages = {356--358},
}

@InProceedings{ harman92revisited,
  author = {Donna Harman},
  title = {Relevance feedback revisited},
  year = 1992,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {1--10},
}

% The book's fields are written out here instead of using crossref: the
% parent entry salton71smart appears earlier in this file, and classic BibTeX
% only resolves a crossref whose target comes after the referring entry.
@InCollection{ ide71,
  author = {E. Ide},
  title = {New experiments in relevance feedback},
  editor = {Gerard Salton},
  booktitle = {The {SMART} Retrieval System -- Experiments in Automatic Document Processing},
  publisher = {Prentice Hall},
  address = {Englewood Cliffs, NJ},
  year = 1971,
  pages = {337--354},
}

@Article{ ruthven03relevance,
  author = {Ruthven, Ian and Lalmas, Mounia},
  year = 2003,
  title = {A survey on the use of relevance feedback for information access systems},
  journal = {Knowledge Engineering Review},
  volume = 18,
  number = 1,
}

@InProceedings{ buckley94relevance,
  author = {Chris Buckley and Gerard Salton and James Allan},
  year = 1994,
  title = {The effect of adding relevance information in a relevance feedback environment},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {292--300},
}

@Article{ spink00use,
  author = {Amanda Spink and Bernard J. Jansen and H.
Cenk Ozmutlu},
  title = {Use of query reformulation and relevance feedback by {E}xcite users},
  journal = {Internet Research: {E}lectronic Networking Applications and Policy},
  volume = 10,
  year = 2000,
  number = 4,
  pages = {317--328},
  url = {ist.psu.edu/faculty\_pages/jjansen/academic/pubs/internetresearch2000.pdf},
}

@InProceedings{ xu96query,
  author = {Jinxi Xu and W. Bruce Croft},
  year = {1996},
  title = {Query Expansion Using Local and Global Document Analysis},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {4--11},
}

@InProceedings{ joachims05clickthrough,
  author = {Thorsten Joachims and Laura Granka and Bing Pan and Helene Hembrooke and Geri Gay},
  title = {Accurately Interpreting Clickthrough Data as Implicit Feedback},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 2005,
  pages = {154--161},
}

@InProceedings{ joachims02clickthrough,
  author = {Thorsten Joachims},
  title = {Optimizing Search Engines Using Clickthrough Data},
  booktitle = {Proc. KDD},
  year = 2002,
  pages = {133--142},
}

% Page range uses an en dash (--), like the rest of the file.
@InProceedings{ greiff98eda,
  author = {Warren R. Greiff},
  year = 1998,
  title = {A theory of term weighting based on exploratory data analysis},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {11--19},
}

@InProceedings{ friedman96tan,
  author = {Friedman, Nir and Moises Goldszmidt},
  year = 1996,
  title = {Building Classifiers using {B}ayesian Networks},
  booktitle = {Proc. National Conference on Artificial Intelligence},
  pages = {1277--1284},
}

@Book{ ripley96,
  author = {B. D.
Ripley},
  title = {Pattern Recognition and Neural Networks},
  publisher = {Cambridge University Press},
  address = {Cambridge},
  year = {1996},
  annote = {Great statistical foundations of classification book!},
}

@Article{ fuhr92probabilistic,
  author = {Norbert Fuhr},
  title = {Probabilistic Models in Information Retrieval},
  journal = {Computer Journal},
  volume = {35},
  number = {3},
  pages = {243--255},
  year = {1992},
}

% van Rijsbergen is written as in the other entries of this file, so the
% surname is parsed and sorted the same way everywhere.
@article{ crestani98probabilistic,
  author = {Fabio Crestani and Mounia Lalmas and Cornelis Joost {van~Rijsbergen} and Iain Campbell},
  title = {Is this document relevant?\ \ldots{} probably: {A} survey of probabilistic models in information retrieval},
  journal = {ACM Computing Surveys},
  volume = {30},
  number = {4},
  year = {1998},
  issn = {0360-0300},
  pages = {528--552},
  doi = {10.1145/299917.299920},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@Article{ sparckjones00probabilistic,
  author = {Sp{\"a}rck Jones, Karen and S. Walker and Stephen E. Robertson},
  title = {A probabilistic model of information retrieval: Development and comparative experiments},
  journal = {IP\&M},
  volume = 36,
  number = 6,
  pages = {779--808, 809--840},
  year = 2000,
}

@Book{ jensen01bayesian,
  author = {Finn V. Jensen and Finn B. Jensen},
  title = {Bayesian Networks and Decision Graphs},
  year = 2001,
  publisher = {Springer},
  address = {Berlin},
}

@Book{ grinstead97probability,
  author = {Grinstead, Charles M. and J.
Laurie Snell},
  year = 1997,
  edition = {2nd},
  title = {Introduction to Probability},
  publisher = {American Mathematical Society},
  address = {Providence, RI},
  url = {www.dartmouth.edu/~chance/teaching\_aids/books\_articles/probability\_book/amsbook.mac.pdf},
}

@Book{ jurafsky00slp,
  author = {Dan Jurafsky and James H. Martin},
  title = {Speech and Language Processing: {A}n Introduction to Natural Language Processing, Computational Linguistics and Speech Recognition},
  publisher = {Prentice Hall},
  address = {Englewood Cliffs, NJ},
  year = 2000,
}

@Book{ jurafsky08slp,
  author = {Dan Jurafsky and James H. Martin},
  title = {Speech and Language Processing: {A}n Introduction to Natural Language Processing, Computational Linguistics and Speech Recognition},
  edition = {2nd},
  publisher = {Prentice Hall},
  address = {Englewood Cliffs, NJ},
  year = {2008},
}

@InProceedings{ ponte98lm,
  author = {Jay M. Ponte and W. Bruce Croft},
  title = {A language modeling approach to information retrieval},
  year = 1998,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {275--281},
}

@InProceedings{ miller99hmm,
  author = {David R. H. Miller and Tim Leek and Richard M. Schwartz},
  title = {A Hidden {Markov} Model Information Retrieval System},
  year = 1999,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {214--221},
}

@InProceedings{ berger99ir,
  author = {Adam Berger and John Lafferty},
  title = {Information retrieval as statistical translation},
  year = 1999,
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  pages = {222--229},
}

@InProceedings{ hiemstra98linguistically,
  author = {Djoerd Hiemstra},
  title = {A linguistically motivated probabilistic model of information retrieval},
  booktitle = {Proc. ECDL},
  series = {LNCS},
  volume = {1513},
  pages = {569--584},
  year = {1998},
}

@Article{ hiemstra00probabilistic,
  author = {Djoerd Hiemstra},
  title = {A probabilistic justification for using tf.idf term weighting in information retrieval},
  journal = {International Journal on Digital Libraries},
  volume = 3,
  number = 2,
  publisher = {Springer},
  pages = {131--139},
  year = 2000,
  issn = {1432-5012},
}

@InProceedings{ lafferty01risk,
  author = {John Lafferty and Chengxiang Zhai},
  year = 2001,
  title = {Document language models, query models, and risk minimization for information retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {111--119},
}

@InProceedings{ zhai01smoothing,
  author = {Chengxiang Zhai and John Lafferty},
  title = {A study of smoothing methods for language models applied to ad hoc information retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 2001,
  pages = {334--342},
}

% The title is no longer wrapped in a second pair of braces, so styles can
% apply their own casing; only the capital after the colon is protected.
@Unpublished{ sparckjones04rational,
  author = {Sp{\"a}rck Jones, Karen},
  title = {Language modelling's generative model: {I}s it rational?},
  year = 2004,
  note = {MS, Computer Laboratory, University of Cambridge},
  url = {www.cl.cam.ac.uk/~ksj21/langmodnote4.pdf},
}

@InProceedings{ zhai01feedback,
  author = {Chengxiang Zhai and John Lafferty},
  title = {Model-based feedback in the language modeling approach to information retrieval},
  booktitle = {Proc.
CIKM},
  publisher = {ACM Press},
  year = 2001,
}

@Book{ croft03lm,
  editor = {Croft, W. Bruce and John Lafferty},
  title = {Language Modeling for Information Retrieval},
  year = 2003,
  address = {New York},
  publisher = {Springer},
}

@InProceedings{ caruana06empirical,
  author = {Rich Caruana and Alexandru Niculescu-Mizil},
  title = {An Empirical Comparison of Supervised Learning Algorithms},
  year = 2006,
  booktitle = {Proc. ICML},
}

@InProceedings{ dumais98inductive,
  author = {Susan Dumais and John Platt and David Heckerman and Mehran Sahami},
  title = {Inductive learning algorithms and representations for text categorization},
  booktitle = {Proc. CIKM},
  year = {1998},
  isbn = {1-58113-061-9},
  pages = {148--155},
  location = {Bethesda, Maryland, United States},
  doi = {10.1145/288627.288651},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@Article{ hand06classifier,
  author = {David J. Hand},
  title = {Classifier Technology and the Illusion of Progress},
  year = 2006,
  journal = {Statistical Science},
  volume = 21,
  pages = {1--14},
}

@Book{ shawe-taylor04kernel,
  author = {John Shawe-Taylor and Nello Cristianini},
  title = {Kernel Methods for Pattern Analysis},
  year = 2004,
  publisher = {Cambridge University Press},
}

@Book{ cristianini00svm,
  author = {Nello Cristianini and John Shawe-Taylor},
  year = 2000,
  title = {Introduction to Support Vector Machines and Other Kernel-based Learning Methods},
  publisher = {Cambridge University Press},
}

@Book{ schoelkopf01kernels,
  author = {Bernhard Sch{\"o}lkopf and Alexander J.
Smola},
  title = {Learning with Kernels: {S}upport Vector Machines, Regularization, Optimization, and Beyond},
  year = {2001},
  publisher = {MIT Press},
}

@article{burges98svm,
  author = {Burges, Christopher J. C.},
  title = {A Tutorial on Support Vector Machines for Pattern Recognition},
  journal = {Data Mining and Knowledge Discovery},
  volume = {2},
  number = {2},
  pages = {121--167},
  year = {1998},
}

@article{chen05nusvm,
  author = {Pai-Hsuen Chen and Chih-Jen Lin and Bernhard Sch{\"o}lkopf},
  year = {2005},
  title = {A tutorial on $\nu$-Support Vector Machines},
  journal = {Applied Stochastic Models in Business and Industry},
  volume = {21},
  pages = {111--136},
}

@book{vapnik98statistical,
  author = {Vladimir N. Vapnik},
  title = {Statistical Learning Theory},
  publisher = {Wiley-Interscience},
  year = {1998},
}

@article{lodhi02text,
  author = {Huma Lodhi and Craig Saunders and John Shawe-Taylor and Nello Cristianini and Chris Watkins},
  year = {2002},
  title = {Text Classification using String Kernels},
  journal = {JMLR},
  volume = {2},
  pages = {419--444},
}

@inproceedings{gaertner02kernels,
  author = {Thomas Gaertner and John W. Lloyd and Peter A. Flach},
  year = {2002},
  title = {Kernels for structured data},
  booktitle = {Proc. International Conference on Inductive Logic Programming},
  pages = {66--83},
}

@inproceedings{joachims98text,
  author = {Thorsten Joachims},
  title = {Text categorization with support vector machines: {L}earning with many relevant features},
  booktitle = {Proc.
ECML},
  publisher = {Springer},
  address = {Heidelberg},
  pages = {137--142},
  year = {1998},
}

% series = {Lecture Notes in Artificial Intelligence},
% number = "1398",
% editor = "Claire N{\'e}dellec and C{\'e}line Rouveirol",

% The three ACM entries below store the bare DOI (no resolver host).
@article{359041,
  author = {James L. Peterson},
  title = {Computer programs for detecting and correcting spelling errors},
  publisher = {ACM Press},
  journal = {CACM},
  volume = {23},
  number = {12},
  year = {1980},
  issn = {0001-0782},
  pages = {676--687},
  doi = {10.1145/359038.359041},
  address = {New York, NY},
}

@article{363994,
  author = {Fred J. Damerau},
  title = {A technique for computer detection and correction of spelling errors},
  publisher = {ACM Press},
  journal = {CACM},
  volume = {7},
  number = {3},
  year = {1964},
  issn = {0001-0782},
  pages = {171--176},
  doi = {10.1145/363958.363994},
  address = {New York, NY},
}

@article{146380,
  author = {Karen Kukich},
  title = {Techniques for automatically correcting words in text},
  publisher = {ACM Press},
  journal = {ACM Computing Surveys},
  volume = {24},
  number = {4},
  year = {1992},
  issn = {0360-0300},
  pages = {377--439},
  doi = {10.1145/146370.146380},
  address = {New York, NY},
}

% Double issue: the number range uses an en dash (double hyphen).
@article{permuterm,
  author = {Eugene Garfield},
  title = {The Permuterm Subject Index: {A}n Autobiographic Review},
  journal = {JASIS},
  year = {1976},
  pages = {288--291},
  volume = {27},
  number = {5--6},
}

@article{zobel95finding,
  author = {Justin Zobel and Philip Dart},
  title = {Finding Approximate Matches in Large Lexicons},
  journal = {Software Practice and Experience},
  year = {1995},
  volume = {25},
  number = {3},
  pages = {331--345},
  url =
{http://citeseer.ifi.unizh.ch/zobel95finding.html},
}

@article{editdistance,
  author = {Vladimir I. Levenshtein},
  title = {Binary codes capable of correcting spurious insertions and deletions of ones},
  journal = {Problems of Information Transmission},
  volume = {1},
  pages = {8--17},
  year = {1965},
}

@article{liu05svms,
  author = {Tie-Yan Liu and Yiming Yang and Hao Wan and Hua-Jun Zeng and Zheng Chen and Wei-Ying Ma},
  title = {Support Vector Machines Classification with Very Large Scale Taxonomy},
  journal = {ACM SIGKDD Explorations},
  volume = {7},
  number = {1},
  pages = {36--43},
  year = {2005},
}

@article{weigend99hierarchy,
  author = {Andreas S. Weigend and Erik D. Wiener and Jan O. Pedersen},
  title = {Exploiting Hierarchy in Text Categorization},
  year = {1999},
  journal = {IR},
  volume = {1},
  number = {3},
  pages = {193--216},
}

% Page range uses a double hyphen like the rest of the file.
@inproceedings{koller97hierarchy,
  author = {Koller, Daphne and Sahami, Mehran},
  year = {1997},
  title = {Hierarchically Classifying Documents Using Very Few Words},
  booktitle = {Proc. ICML},
  pages = {170--178},
}

@inproceedings{dumais00hierarchical,
  author = {Susan T. Dumais and Hao Chen},
  title = {Hierarchical classification of {W}eb content},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {256--263},
  year = {2000},
}

@article{pugh90skip,
  author = {William Pugh},
  title = {Skip lists: {A} probabilistic alternative to balanced trees},
  journal = {CACM},
  volume = {33},
  number = {6},
  pages = {668--676},
  year = {1990},
}

@inproceedings{papineni01why,
  author = {Kishore Papineni},
  title = {Why Inverse Document Frequency?},
  booktitle = {Proc.
North American Chapter of the Association for Computational Linguistics},
  year = {2001},
  pages = {1--8},
}

@techreport{buckleysalton_termweighting,
  author = {Gerard Salton and Chris Buckley},
  title = {Term Weighting Approaches in Automatic Text Retrieval},
  year = {1987},
  institution = {Cornell University},
  address = {Ithaca, NY, USA},
}

@article{salton88term,
  author = {Gerard Salton and Christopher Buckley},
  title = {Term-Weighting Approaches in Automatic Text Retrieval},
  year = {1988},
  journal = {IP\&M},
  volume = {24},
  number = {5},
  pages = {513--523},
}

@article{luhn57,
  author = {Hans Peter Luhn},
  title = {A statistical approach to mechanized encoding and searching of literary information},
  journal = {IBM Journal of Research and Development},
  volume = {1},
  number = {4},
  pages = {309--317},
  year = {1957},
}

@article{luhn58,
  author = {Hans Peter Luhn},
  title = {The Automatic Creation of Literature Abstracts},
  journal = {IBM Journal of Research and Development},
  volume = {2},
  number = {2},
  pages = {159--165, 317},
  year = {1958},
}

% Page range uses a double hyphen; url carries an explicit scheme.
@inproceedings{singhal96pivoted,
  author = {Amit Singhal and Chris Buckley and Mandar Mitra},
  title = {Pivoted Document Length Normalization},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {21--29},
  year = {1996},
  url = {http://citeseer.ist.psu.edu/singhal96pivoted.html},
}

@inproceedings{toutanova02pronunciation,
  author = {Kristina Toutanova and Robert C. Moore},
  title = {Pronunciation Modeling for Improved Spelling Correction},
  booktitle = {Proc. ACL},
  year = {2002},
  pages = {144--151},
}

@inproceedings{kernighan90spelling,
  author = {Mark D. Kernighan and Kenneth W. Church and William A.
Gale},
  title = {A spelling correction program based on a noisy channel model},
  year = {1990},
  booktitle = {Proc. ACL},
  pages = {205--210},
}

% volume = 2,

@inproceedings{brill00improved,
  author = {Eric Brill and Robert C. Moore},
  year = {2000},
  title = {An improved error model for noisy channel spelling correction},
  booktitle = {Proc. ACL},
  pages = {286--293},
}

@techreport{viewingterm,
  author = {Ruihua Song and Ji-Rong Wen and Wei-Ying Ma},
  title = {Viewing Term Proximity from a Different Perspective},
  institution = {Microsoft Research},
  year = {2005},
  number = {MSR-TR-2005-69},
}

% Initials are space-separated so BibTeX parses "L." and "A." as two name tokens.
@article{onetothree,
  author = {Charles L. A. Clarke and Gordon V. Cormack and Elizabeth A. Tudhope},
  title = {Relevance ranking for one to three term queries},
  journal = {IP\&M},
  volume = {36},
  year = {2000},
  pages = {291--311},
}

% month uses the predefined macro (unquoted) so the style can format it.
@article{gao05chinese,
  title = {{C}hinese Word Segmentation and Named Entity Recognition: A Pragmatic Approach},
  author = {Jianfeng Gao and Mu Li and Chang-Ning Huang and Andi Wu},
  journal = {Computational Linguistics},
  month = dec,
  year = {2005},
  volume = {31},
  number = {4},
  pages = {531--574},
}

@inproceedings{cavnar94ngram,
  author = {William B. Cavnar and John M. Trenkle},
  title = {N-Gram-Based Text Categorization},
  booktitle = {Proc. SDAIR},
  pages = {161--175},
  year = {1994},
}

@techreport{dunning94identification,
  author = {Ted Dunning},
  title = {Statistical Identification of Language},
  institution = {Computing Research Laboratory, New Mexico State University},
  number = {94-273},
  year = {1994},
  howpublished = {Inside the package \url{ftp://crl.nmsu.edu/pub/misc/lingdet\_suite.tar.gz}},
}

@book{konheim81cryptography,
  author = {Alan G.
Konheim},
  year = {1981},
  title = {Cryptography: {A} Primer},
  publisher = {John Wiley \& Sons},
}

% Year corrected from 1998 to 1988, in agreement with the citation key.
@inproceedings{beesley88identifier,
  author = {Kenneth R. Beesley},
  year = {1988},
  title = {Language Identifier: {A} Computer Program for Automatic Natural-Language Identification of On-Line Text},
  booktitle = {Languages at Crossroads: {P}roc. Annual Conference of the American Translators Association},
  pages = {47--54},
}

@inproceedings{hughes06identification,
  author = {Hughes, Baden and Baldwin, Timothy and Bird, Steven and Nicholson, Jeremy and MacKinlay, Andrew},
  year = {2006},
  title = {Reconsidering Language Identification for Written Language Resources},
  booktitle = {Proc. International Conference on Language Resources and Evaluation},
  pages = {485--488},
}

@inproceedings{mckeown02news,
  author = {Kathleen R. McKeown and Regina Barzilay and David Evans and Vasileios Hatzivassiloglou and Judith L. Klavans and Ani Nenkova and Carl Sable and Barry Schiffman and Sergey Sigelman},
  title = {Tracking and Summarizing News on a Daily Basis with {C}olumbia's {N}ewsblaster},
  booktitle = {Proc. Human Language Technology Conference},
  year = {2002},
}

% Page range uses a double hyphen like the rest of the file.
@inproceedings{chen00multilingual,
  author = {Hsin-Hsi Chen and Chuan-Jie Lin},
  year = {2000},
  title = {A Multilingual News Summarizer},
  booktitle = {Proc. COLING},
  pages = {159--165},
}

@book{sproat92morphology,
  author = {Sproat, Richard William},
  title = {Morphology and computation},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  year = {1992},
}

@book{beesley03finite,
  title = {Finite State Morphology},
  author = {Kenneth R.
Beesley and Lauri Karttunen},
  publisher = {CSLI Publications},
  address = {Stanford, CA},
  year = {2003},
}

@article{zobel06inverted,
  author = {Justin Zobel and Alistair Moffat},
  year = {2006},
  title = {Inverted Files for Text Search Engines},
  journal = {ACM Computing Surveys},
  volume = {38},
  number = {2},
}

@techreport{mercator1,
  author = {Marc Najork and Allan Heydon},
  title = {High-Performance Web Crawling},
  institution = {Compaq Systems Research Center},
  year = {2001},
  number = {173},
}

@incollection{najorkheydon2002,
  author = {Marc Najork and Allan Heydon},
  title = {High-Performance Web Crawling},
  booktitle = {Handbook of Massive Data Sets},
  publisher = {Kluwer},
  year = {2002},
  editor = {James Abello and Panos Pardalos and Mauricio Resende},
  chapter = {2},
}

@inproceedings{brinpageanatomy,
  author = {Sergey Brin and Lawrence Page},
  year = {1998},
  title = {The anatomy of a large-scale hypertextual Web search engine},
  booktitle = {Proc. WWW},
  pages = {107--117},
}

@inproceedings{chocrawling1998,
  author = {Junghoo Cho and Hector Garcia-Molina and Lawrence Page},
  year = {1998},
  title = {Efficient crawling through {URL} ordering},
  booktitle = {Proc. WWW},
  pages = {161--172},
}

@inproceedings{webbase2000,
  author = {Jun Hirai and Sriram Raghavan and Hector Garcia-Molina and Andreas Paepcke},
  year = {2000},
  title = {{WebBase}: {A} repository of web pages},
  booktitle = {Proc.
WWW},
  pages = {277--293},
}

@article{burnercrawling,
  author = {Mike Burner},
  title = {Crawling towards Eternity: {B}uilding an archive of the {W}orld {W}ide {W}eb},
  journal = {Web Techniques Magazine},
  volume = {2},
  number = {5},
  year = {1997},
}

@inproceedings{connserver,
  author = {Krishna Bharat and Andrei Broder and Monika Henzinger and Puneet Kumar and Suresh Venkatasubramanian},
  year = {1998},
  title = {The connectivity server: {F}ast access to linkage information on the Web},
  booktitle = {Proc. WWW},
  pages = {469--477},
}

% Only the case-sensitive words are brace-protected; the title as a whole
% is left to the bibliography style.
@inproceedings{boldivigna1,
  author = {Paolo Boldi and Sebastiano Vigna},
  year = {2004},
  title = {The {WebGraph} framework {I}: {C}ompression techniques},
  booktitle = {Proc. WWW},
  publisher = {ACM Press},
  pages = {595--601},
}

@article{boldivigna2,
  author = {Paolo Boldi and Sebastiano Vigna},
  title = {Codes for the {World-Wide Web}},
  journal = {Internet Mathematics},
  year = {2004},
  pages = {405--427},
  volume = {2},
  number = {4},
}

@techreport{page98pagerank,
  author = {Lawrence Page and Sergey Brin and Rajeev Motwani and Terry Winograd},
  institution = {Stanford Digital Library Technologies Project},
  title = {The {PageRank} Citation Ranking: {B}ringing Order to the Web},
  year = {1998},
  url = {http://citeseer.ist.psu.edu/page98pagerank.html},
}

@inproceedings{haveliwala02topicsensitive,
  author = {Taher H. Haveliwala},
  title = {Topic-sensitive {PageRank}},
  booktitle = {Proc.
WWW},
  address = {Honolulu, HI},
  month = may,
  year = {2002},
  url = {http://citeseer.ist.psu.edu/haveliwala02topicsensitive.html},
}

@article{haveliwala03topicsensitive,
  author = {Taher Haveliwala},
  title = {Topic-sensitive {PageRank}: {A} context-sensitive ranking algorithm for web search},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  volume = {15},
  number = {4},
  pages = {784--796},
  year = {2003},
  url = {http://citeseer.ist.psu.edu/article/haveliwala03topicsensitive.html},
}

@article{kleinberg99authoritative,
  author = {Jon M. Kleinberg},
  title = {Authoritative sources in a hyperlinked environment},
  journal = {JACM},
  volume = {46},
  number = {5},
  pages = {604--632},
  year = {1999},
  url = {http://citeseer.ist.psu.edu/article/kleinberg98authoritative.html},
}

% Title is no longer wrapped in an outer brace pair; it contains no
% case-sensitive words, so the style may case it freely.
@inproceedings{jehwidom,
  author = {Glen Jeh and Jennifer Widom},
  title = {Scaling personalized web search},
  booktitle = {Proc. WWW},
  publisher = {ACM Press},
  year = {2003},
  pages = {271--279},
  location = {Budapest},
  address = {New York, NY},
}

@inproceedings{chakrabarti98automatic,
  author = {Soumen Chakrabarti and Byron Dom and David Gibson and Jon Kleinberg and Prabhakar Raghavan and Sridhar Rajagopalan},
  title = {Automatic resource list compilation by analyzing hyperlink structure and associated text},
  booktitle = {Proc. WWW},
  year = {1998},
  url = {http://citeseer.ist.psu.edu/chakrabarti98automatic.html},
}

@inproceedings{bharat98improved,
  author = {Krishna Bharat and Monika R. Henzinger},
  title = {Improved algorithms for topic distillation in a hyperlinked environment},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  address = {Melbourne, AU},
  pages = {104--111},
  year = {1998},
  url = {http://citeseer.ist.psu.edu/bharat98improved.html},
}

% Page range uses a double hyphen like the rest of the file.
@inproceedings{ng01link,
  author = {Andrew Y. Ng and Alice X. Zheng and Michael I. Jordan},
  title = {Link Analysis, Eigenvectors and Stability},
  booktitle = {Proc. IJCAI},
  pages = {903--910},
  year = {2001},
  url = {http://citeseer.ist.psu.edu/ng01link.html},
}

@inproceedings{borodintsaparas,
  author = {Allan Borodin and Gareth O. Roberts and Jeffrey S. Rosenthal and Panayiotis Tsaparas},
  title = {Finding authorities and hubs from link structures on the {World Wide Web}},
  booktitle = {Proc. WWW},
  year = {2001},
  pages = {415--429},
}

@article{lempel00stochastic,
  author = {Ronny Lempel and Shlomo Moran},
  title = {The stochastic approach for link-structure analysis {({SALSA})} and the {TKC} effect},
  journal = {Computer Networks},
  volume = {33},
  number = {1--6},
  pages = {387--401},
  year = {2000},
  url = {http://citeseer.ist.psu.edu/lempel00stochastic.html},
}

@techreport{baeza05choice,
  abstract = {This paper studies a family of link-based algorithms that
    propagate page importance through links. In these
    algorithms there is a damping function that decreases with
    the distance, so a direct link implies more endorsement
    than a link through a long path. {PageRank} is the most
    widely known ranking function of this family. We focus on
    three damping functions, having linear, exponential, and
    hyperbolic decay on the lengths of the paths. The
    exponential decay corresponds to {PageRank}, and the other
    functions are new.
Our analysis includes a comparison among
    them and experiments for studying their behavior under
    different parameters.},
  author = {Ricardo Baeza-Yates and Paolo Boldi and Carlos Castillo},
  citeulike-article-id = {322774},
  institution = {Dipartimento di Scienze dell'Informazione, Universit{\`a} degli Studi di Milano},
  keywords = {ranking web-graph},
  month = sep,
  priority = {0},
  title = {The Choice of a Damping Function for Propagating Importance in Link-Based Ranking},
  year = {2005},
}

@inproceedings{boldi05pagerank,
  author = {Paolo Boldi and Massimo Santini and Sebastiano Vigna},
  title = {{PageRank} as a function of the damping factor},
  booktitle = {Proc. WWW},
  year = {2005},
  url = {http://citeseer.ist.psu.edu/boldi05pagerank.html},
}

@article{berkhinpagerank,
  author = {Pavel Berkhin},
  title = {A survey on pagerank computing},
  journal = {Internet Mathematics},
  volume = {2},
  number = {1},
  pages = {73--120},
  year = {2005},
}

@inproceedings{boldi02ubicrawler,
  author = {Paolo Boldi and Bruno Codenotti and Massimo Santini and Sebastiano Vigna},
  title = {Ubicrawler: {A} scalable fully distributed web crawler},
  booktitle = {Proc. Australian World Wide Web Conference},
  year = {2002},
  url = {http://citeseer.ist.psu.edu/article/boldi03ubicrawler.html},
}

@inproceedings{shkapenyuk02design,
  author = {Vladislav Shkapenyuk and Torsten Suel},
  title = {Design and Implementation of a High-Performance Distributed Web Crawler},
  booktitle = {Proc. International Conference on Data Engineering},
  year = {2002},
  url = {http://citeseer.ist.psu.edu/shkapenyuk02design.html},
}

@article{321094,
  author = {Charles P. Bourne and Donald F.
Ford},
  title = {A Study of Methods for Systematically Abbreviating {E}nglish Words and Names},
  publisher = {ACM Press},
  journal = {JACM},
  volume = {8},
  number = {4},
  year = {1961},
  issn = {0004-5411},
  pages = {538--552},
  doi = {10.1145/321088.321094},
  address = {New York, NY},
}

@inproceedings{garcia04access,
  title = {Access-ordered indexes},
  booktitle = {Proc. Australasian Conference on Computer Science},
  pages = {7--14},
  year = {2004},
  author = {Steven Garcia and Hugh E. Williams and Adam Cannane},
}

@incollection{robertson05okapi,
  author = {Stephen Robertson},
  title = {How {O}kapi came to {TREC}},
  crossref = {voorhees05experiment},
  year = {2005},
  pages = {287--299},
}

% editor = {E.M. Voorhees and D.K. Harman},
% booktitle = {{TREC}: {E}xperiments and Evaluation in Information
% Retrieval},
% publisher = {MIT Press},

% The accented letter is written as a brace group starting with the accent
% command, the form BibTeX treats as a single character.
@article{aizerman64theoretical,
  author = {Mark A. Aizerman and Emmanuel M. Braverman and Lev I. Rozono{\'e}r},
  year = {1964},
  title = {Theoretical foundations of the potential function method in pattern recognition learning},
  journal = {Automation and Remote Control},
  volume = {25},
  pages = {821--837},
}

@inproceedings{radev01interactive,
  author = {Dragomir R. Radev and Sasha Blair-Goldensohn and Zhu Zhang and Revathi Sundara Raghavan},
  year = {2001},
  title = {Interactive, Domain-Independent Identification and Summarization of Topically Related News Articles},
  booktitle = {Proc. European Conference on Research and Advanced Technology for Digital Libraries},
  pages = {225--238},
}

@book{knuthvol3,
  author = {Donald E.
Knuth},
  year = {1997},
  title = {The Art of Computer Programming, Volume 3: {S}orting and Searching},
  publisher = {Addison Wesley},
  edition = {3rd},
}

@inproceedings{cohen98learning,
  author = {William W. Cohen and Robert E. Schapire and Yoram Singer},
  title = {Learning to Order Things},
  booktitle = {Proc. NIPS},
  publisher = {The {MIT} Press},
  year = {1998},
  url = {http://citeseer.ist.psu.edu/article/cohen98learning.html},
}

% editor = "Michael I. Jordan and Michael J. Kearns and Sara A. Solla",
% volume = "10",

@inproceedings{zaragoza03bayesian,
  author = {Hugo Zaragoza and Djoerd Hiemstra and Michael Tipping and Stephen Robertson},
  title = {Bayesian Extension to the Language Model for Ad Hoc Information Retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2003},
  pages = {4--9},
}

@incollection{hiemstra05lm,
  author = {Djoerd Hiemstra and Wessel Kraaij},
  title = {A Language-Modeling Approach to {TREC}},
  crossref = {voorhees05experiment},
  year = {2005},
  pages = {373--395},
}

@inproceedings{gao04dependence,
  author = {Jianfeng Gao and Jian-Yun Nie and Guangyuan Wu and Guihong Cao},
  title = {Dependence language model for information retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {170--177},
  year = {2004},
}

@inproceedings{cao05integrating,
  author = {Guihong Cao and Jian-Yun Nie and Jing Bai},
  title = {Integrating word relationships into language models},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {298--305},
  year = {2005},
}

@inproceedings{santos01distributed,
  author = {Claudine Santos Badue and Ricardo A.
Baeza-Yates and Berthier Ribeiro-Neto and Nivio Ziviani},
  title = {Distributed Query Processing Using Partitioned Inverted Files},
  booktitle = {Proc. SPIRE},
  year = {2001},
  pages = {10--20},
}

@inproceedings{ribeiro-neto98query,
  author = {Berthier A. Ribeiro-Neto and Ramurti A. Barbosa},
  title = {Query Performance for Tightly Coupled Distributed Digital Libraries},
  booktitle = {Proc. ACM Conference on Digital Libraries},
  year = {1998},
  pages = {182--190},
}

@article{tomasic93query,
  author = {Anthony Tomasic and Hector Garcia-Molina},
  title = {Query Processing and Inverted Indices in Shared-Nothing Document Information Retrieval Systems},
  journal = {VLDB Journal},
  volume = {2},
  number = {3},
  year = {1993},
  pages = {243--275},
}

@article{jeong95inverted,
  author = {Byeong-Soo Jeong and Edward Omiecinski},
  title = {Inverted File Partitioning Schemes in Multiple Disk Systems},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  volume = {6},
  number = {2},
  pages = {142--153},
  year = {1995},
}

% Initials are space-separated so each one is parsed as its own name token.
@inproceedings{macfarlane00parallel,
  author = {A. MacFarlane and J. A. McCann and S. E. Robertson},
  title = {Parallel Search using Partitioned Inverted Files},
  booktitle = {Proc. SPIRE},
  year = {2000},
  pages = {209--220},
}

@incollection{liddy05automatic,
  author = {Elizabeth D.
Liddy},
  year = {2005},
  title = {Automatic Document Retrieval},
  booktitle = {Encyclopedia of Language and Linguistics},
  edition = {2nd},
  publisher = {Elsevier},
}

@article{bush45memex,
  author = {Vannevar Bush},
  title = {As We May Think},
  journal = {The Atlantic Monthly},
  year = {1945},
  url = {http://www.theatlantic.com/doc/194507/bush},
}

@book{taube58information,
  editor = {Mortimer Taube and Harold Wooster},
  year = {1958},
  title = {Information storage and retrieval: {T}heory, systems, and devices},
  address = {New York},
  publisher = {Columbia University Press},
}

@incollection{mooers61mathematical,
  author = {Mooers, Calvin},
  title = {From a point of view of mathematical etc. techniques},
  pages = {xvii--xxiii},
  editor = {Fairthorne, R. A.},
  booktitle = {Towards information retrieval},
  address = {London},
  publisher = {Butterworths},
  year = {1961},
}

@inproceedings{long03optimized,
  author = {Xiaohui Long and Torsten Suel},
  title = {Optimized Query Execution in Large Search Engines with Global Page Ordering},
  booktitle = {Proc. VLDB},
  year = {2003},
  url = {http://citeseer.ist.psu.edu/long03optimized.html},
}

@book{spink05cognitive,
  editor = {Amanda Spink and Charles Cole},
  title = {New Directions in Cognitive Information Retrieval},
  year = {2005},
  publisher = {Springer},
}

@inproceedings{zobel96phonetic,
  author = {Justin Zobel and Philip Dart},
  title = {Phonetic String Matching: {L}essons from Information Retrieval},
  year = {1996},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  pages = {166--173},
}

@inproceedings{cucerzan04spelling,
  author = {Silviu Cucerzan and Eric Brill},
  title = {Spelling Correction as an Iterative Process that Exploits the Collective Knowledge of Web Users},
  booktitle = {Proc. Empirical Methods in Natural Language Processing},
  year = {2004},
}

@inproceedings{chierichetti2007,
  author = {Flavio Chierichetti and Alessandro Panconesi and Prabhakar Raghavan and Mauro Sozio and Alessandro Tiberi and Eli Upfal},
  title = {Finding Near Neighbors Through Cluster Pruning},
  booktitle = {Proc. PODS},
  year = {2007},
}

% Page range uses a double hyphen like the rest of the file.
@article{eckartyoung,
  author = {Carl Eckart and Gale Young},
  title = {The approximation of a matrix by another of lower rank},
  journal = {Psychometrika},
  volume = {1},
  pages = {211--218},
  year = {1936},
}

% Page range uses a double hyphen; month uses the predefined macro.
@inproceedings{th:plsi,
  author = {Thomas Hofmann},
  title = {{P}robabilistic {L}atent {S}emantic {I}ndexing},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  address = {Berkeley, California},
  pages = {50--57},
  month = aug,
  year = {1999},
  url = {http://citeseer.ist.psu.edu/article/hofmann99probabilistic.html},
}

% Hofmann's 1999 UAI paper is "Probabilistic Latent Semantic Analysis";
% the "Indexing" title belongs to the SIGIR paper above.
@inproceedings{hofmann99probabilistic,
  author = {Thomas Hofmann},
  title = {{P}robabilistic {L}atent {S}emantic {A}nalysis},
  booktitle = {Proc.
UAI},
  address = {Stockholm},
  year = {1999},
  url = {http://citeseer.ist.psu.edu/hofmann99probabilistic.html},
}

% Strang wrote this textbook, so he is recorded as author rather than editor.
@book{strang,
  author = {Gilbert Strang},
  title = {Introduction to Applied Mathematics},
  year = {1986},
  publisher = {Wellesley-Cambridge Press},
}

% Page range uses a double hyphen like the rest of the file.
@article{bernerslee92worldwide,
  author = {Tim Berners-Lee and Robert Cailliau and Jean-Francois Groff and Bernd Pollermann},
  title = {{World-Wide Web}: {T}he Information Universe},
  journal = {Electronic Networking: {R}esearch, Applications and Policy},
  volume = {1},
  number = {2},
  pages = {74--82},
  year = {1992},
  url = {http://citeseer.ist.psu.edu/article/berners-lee92worldwide.html},
}

@inproceedings{kumar00the,
  author = {S. Ravi Kumar and Prabhakar Raghavan and Sridhar Rajagopalan and Dandapani Sivakumar and Andrew Tomkins and Eli Upfal},
  title = {{T}he {W}eb as a Graph},
  booktitle = {Proc. PODS},
  publisher = {ACM Press},
  pages = {1--10},
  year = {2000},
  url = {http://citeseer.ist.psu.edu/article/kumar00web.html},
}

% Only the acronyms and proper noun are brace-protected; the title as a
% whole is left to the bibliography style.
@inproceedings{mcbryan94genvl,
  author = {Oliver A. McBryan},
  title = {{GENVL} and {WWWW}: {T}ools for Taming the {Web}},
  booktitle = {Proc. WWW},
  address = {Geneva},
  year = {1994},
  url = {http://citeseer.ist.psu.edu/mcbryan94genvl.html},
}

% editor = "O. Nierstarsz",

@inproceedings{bgmz97shingling,
  author = {Andrei Z. Broder and Steven C. Glassman and Mark S. Manasse and Geoffrey Zweig},
  title = {Syntactic clustering of the web},
  booktitle = {Proc.
WWW},
  pages = {391--404},
  year = {1997},
}

@article{792552,
  author = {Andrei Broder},
  title = {A taxonomy of web search},
  publisher = {ACM Press},
  journal = {SIGIR Forum},
  volume = {36},
  number = {2},
  year = {2002},
  issn = {0163-5840},
  pages = {3--10},
  doi = {10.1145/792550.792552},
  address = {New York, NY},
}

% Volume 33 of Computer Networks was published as the combined issue 1--6
% (the number given for another article from that volume in this file);
% month uses the predefined macro.
@article{440656,
  author = {Andrei Broder and S. Ravi Kumar and Farzin Maghoul and Prabhakar Raghavan and Sridhar Rajagopalan and Raymie Stata and Andrew Tomkins and Janet Wiener},
  journal = {Computer Networks},
  month = jun,
  number = {1--6},
  pages = {309--320},
  title = {Graph structure in the Web},
  volume = {33},
  year = {2000},
}

% Issue range uses a double hyphen; doi holds the bare DOI.
@article{297863,
  author = {Krishna Bharat and Andrei Broder},
  title = {A technique for measuring the relative size and overlap of public Web search engines},
  journal = {Computer Networks and ISDN Systems},
  volume = {30},
  number = {1--7},
  year = {1998},
  issn = {0169-7552},
  pages = {379--388},
  doi = {10.1016/S0169-7552(98)00127-5},
  publisher = {Elsevier},
  address = {Amsterdam},
}

@article{lawrence98searching,
  author = {Steve Lawrence and C. Lee Giles},
  title = {Searching the {World Wide Web}},
  journal = {Science},
  volume = {280},
  number = {5360},
  pages = {98--100},
  year = {1998},
  url = {http://citeseer.ist.psu.edu/lawrence98searching.html},
}

@inproceedings{rusmevichientong01methods,
  author = {Paat Rusmevichientong and David M. Pennock and Steve Lawrence and C. Lee Giles},
  title = {Methods for Sampling Pages Uniformly from the World Wide Web},
  booktitle = {Proc.
{AAAI} Fall Symposium on Using Uncertainty Within Computation},
  pages = "121--128",
  year = "2001",
  url = "citeseer.ist.psu.edu/rusmevichientong01methods.html",
}

% Nature volume corrected: the 8 July 1999 issue is volume 400, not 500.
@Article{lawrence99giles,
  author = "Steve Lawrence and C. Lee Giles",
  title = "Accessibility of information on the Web",
  journal = {Nature},
  volume = "400",
  pages = "107--109",
  year = "1999",
}

@InProceedings{346289,
  author = {Monika R. Henzinger and Allan Heydon and Michael Mitzenmacher and Marc Najork},
  title = {On near-uniform {URL} sampling},
  booktitle = {Proc. WWW},
  year = {2000},
  pages = {295--308},
  location = {Amsterdam},
  doi = {dx.doi.org/10.1016/S1389-1286(00)00055-4},
  publisher = {North-Holland},
  address = {Amsterdam, The Netherlands},
}

@InProceedings{1135833,
  author = {Ziv Bar-Yossef and Maxim Gurevich},
  title = {Random sampling from a search engine's index},
  booktitle = {Proc. WWW},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-323-9},
  pages = {367--376},
  location = {Edinburgh},
  doi = {doi.acm.org/10.1145/1135777.1135833},
  address = {New York, NY},
}

@Article{bharat00comparison,
  author = "Krishna Bharat and Andrei Z. Broder and Jeffrey Dean and Monika Rauch Henzinger",
  title = "A comparison of techniques to find mirrored hosts on the {WWW}",
  journal = {JASIS},
  volume = "51",
  number = "12",
  pages = "1114--1122",
  year = "2000",
  url = "citeseer.ist.psu.edu/bharat99comparison.html",
}

@InProceedings{511464,
  author = {Junghoo Cho and Hector Garcia-Molina},
  title = {Parallel crawlers},
  booktitle = {Proc.
WWW},
  publisher = {ACM Press},
  year = {2002},
  isbn = {1-58113-449-5},
  pages = {124--135},
  location = {Honolulu, HI},
  doi = {doi.acm.org/10.1145/511446.511464},
  address = {New York, NY},
}

@InProceedings{carmel01static,
  author = {David Carmel and Doron Cohen and Ronald Fagin and Eitan Farchi and Michael Herscovici and Yoelle S. Maarek and Aya Soffer},
  title = {Static index pruning for information retrieval systems},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2001},
  isbn = {1-58113-331-6},
  pages = {43--50},
  location = {New Orleans, LA},
  doi = {doi.acm.org/10.1145/383952.383958},
  address = {New York, NY},
}

@Book{friedl06regular,
  author = {Jeffrey E. F. Friedl},
  year = 2006,
  edition = {3rd},
  title = {Mastering Regular Expressions},
  publisher = {O'Reilly},
  address = {Sebastopol, CA},
}

@InProceedings{comperm,
  author = {Paolo Ferragina and Rossano Venturini},
  title = {Compressed permuterm indexes},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2007},
  address = {New York, NY},
}

@Article{maron60relevance,
  author = {Maron, M. E. and Kuhns, J. L.},
  title = {On relevance, probabilistic indexing, and information retrieval},
  journal = {JACM},
  volume = 7,
  number = 3,
  pages = {216--244},
  year = 1960,
}

@Article{blair85evaluation,
  author = {David C. Blair and M. E. Maron},
  year = 1985,
  title = {An Evaluation of Retrieval Effectiveness for a Full-Text Document-Retrieval System},
  journal = {CACM},
  volume = 28,
  number = 3,
  pages = {289--299},
}

@Book{siegel88nonparametric,
  author = {Sidney Siegel and Castellan, Jr., N.
John},
  title = "Nonparametric Statistics for the Behavioral Sciences",
  edition = {2nd},
  publisher = {McGraw Hill},
  address = {New York},
  year = "1988",
}

@Article{voorhees00variations,
  author = {Ellen M. Voorhees},
  title = {Variations in Relevance Judgments and the Measurement of Retrieval Effectiveness},
  journal = {IP\&M},
  volume = {36},
  pages = {697--716},
  year = {2000},
}

@InProceedings{aslam05geometric,
  author = {Javed A. Aslam and Emine Yilmaz},
  year = 2005,
  title = {A geometric interpretation and analysis of {R}-precision},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  pages = {664--671},
}

@InProceedings{strohman07efficient,
  author = {Trevor Strohman and Croft, W. Bruce},
  title = {Efficient Document Retrieval in Main Memory},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {175--182},
  year = 2007,
}

@Book{gusfield97algorithms,
  author = {Dan Gusfield},
  title = {Algorithms on Strings, Trees and Sequences: {C}omputer Science and Computational Biology},
  year = 1997,
  publisher = {Cambridge University Press},
  address = {Cambridge},
}

@TechReport{lee88experimental,
  author = {Lee, Whay C. and Fox, Edward A.},
  year = 1988,
  title = {Experimental Comparison of Schemes for Interpreting {B}oolean Queries},
  number = {TR-88-27},
  institution = {Computer Science, Virginia Polytechnic Institute and State University},
}

@Book{hopcroft00automata,
  author = {John E. Hopcroft and Rajeev Motwani and Jeffrey D.
Ullman},
  title = {Introduction to Automata Theory, Languages, and Computation},
  publisher = {Addison Wesley},
  edition = {2nd},
  year = 2000,
}

@Article{johnson06effective,
  author = {Johnson, David and Malhotra, Vishv and Vamplew, Peter},
  year = 2006,
  title = {More Effective Web Search Using Bigrams and Trigrams},
  journal = {Webology},
  volume = 3,
  number = {4},
  note = {Article 35},
  url = {www.webology.ir/2006/v3n4/a35.html},
}

@InProceedings{kammenhuber06web,
  author = {Nils Kammenhuber and Julia Luxenburger and Anja Feldmann and Gerhard Weikum},
  title = {Web search clickstreams},
  booktitle = {Proc. ACM SIGCOMM on Internet Measurement},
  publisher = {ACM Press},
  pages = {245--250},
  year = 2006,
  address = {Rio de Janeiro, Brazil},
}

@article{silverstein99analysis,
  author = {Craig Silverstein and Monika Rauch Henzinger and Hannes Marais and Michael Moricz},
  title = {Analysis of a Very Large Web Search Engine Query Log},
  journal = {SIGIR Forum},
  volume = {33},
  number = {1},
  year = {1999},
  pages = {6--12},
  ee = {db/journals/sigir/SilversteinHMM99.html},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@TechReport{silverstein98analysis,
  author = {Craig Silverstein and Monika Henzinger and Hannes Marais and Michael Moricz},
  title = {Analysis of a Very Large {A}lta{V}ista Query Log},
  year = {1998},
  institution = {Digital SRC},
  number = {1998-014},
}

@Article{luk02comparison,
  title = {A comparison of {C}hinese document indexing strategies and retrieval models},
  author = {Robert W. P.
Luk and Kui-Lam Kwok},
  journal = {ACM Transactions on Asian Language Information Processing},
  year = {2002},
  volume = 1,
  number = 3,
  pages = {225--268},
}

@InProceedings{kishida05clir,
  title = {Overview of {CLIR} Task at the Fifth {NTCIR} Workshop},
  author = {Kazuaki Kishida and Kuang-Hua Chen and Sukhoon Lee and Kazuko Kuriyama and Noriko Kando and Hsin-Hsi Chen and Sung Hyon Myaeng},
  booktitle = {{NTCIR} Workshop Meeting on Evaluation of Information Access Technologies: {I}nformation Retrieval, Question Answering and Cross-Lingual Information Access},
  year = {2005},
  publisher = {National Institute of Informatics},
  address = {Tokyo},
}

@Misc{sifry07state,
  author = {Dave Sifry},
  year = {2007},
  title = {The State of the {L}ive {W}eb, {A}pril 2007},
  url = {technorati.com/weblog/2007/04/328.html},
}

@Article{gerrand07estimating,
  author = {Gerrand, Peter},
  year = {2007},
  title = {Estimating linguistic diversity on the Internet: {A} taxonomy to avoid pitfalls and paradoxes},
  journal = {Journal of Computer-Mediated Communication},
  volume = {12},
  number = {4},
  note = {article 8},
  url = {jcmc.indiana.edu/vol12/issue4/gerrand.html},
}

@Article{hollink04monolingual,
  author = {Vera Hollink and Jaap Kamps and Christof Monz and Maarten de Rijke},
  title = {Monolingual Document Retrieval for {E}uropean Languages},
  journal = {IR},
  volume = {7},
  number = {1},
  pages = {33--52},
  year = 2004,
}

@InProceedings{tomlinson03lexical,
  author = {Stephen Tomlinson},
  title = {Lexical and Algorithmic Stemming Compared for 9 {E}uropean Languages with {H}ummingbird {S}earchServer at {CLEF 2003}},
  booktitle = {Proc.
Cross-Language Evaluation Forum},
  pages = {286--300},
  year = 2003,
}

@Article{barilan05how,
  author = {Judit Bar-Ilan and Tatyana Gutman},
  title = {How do search engines respond to some non-{E}nglish queries?},
  journal = {Journal of Information Science},
  year = 2005,
  volume = 31,
  number = 1,
  pages = {13--28},
}

@Book{jackson02natural,
  author = {Jackson, Peter and Isabelle Moulinier},
  year = 2002,
  title = {Natural Language Processing for Online Applications: {T}ext Retrieval, Extraction and Categorization},
  publisher = {John Benjamins},
  isbn = {1-58811-250-0},
}

% "stanford" is a non-standard field (apparently a library call number); BibTeX styles ignore it.
@InProceedings{hayes90construe,
  author = {Philip J. Hayes and Steven P. Weinstein},
  year = 1990,
  title = {{CONSTRUE/TIS}: {A} System for Content-Based Indexing of a Database of News Stories},
  booktitle = {Proc. Conference on Innovative Applications of Artificial Intelligence},
  pages = {49--66},
  stanford = {Green or Math Q334 .I5433 1990},
}

@InProceedings{klein02conditional,
  author = {Dan Klein and Christopher D. Manning},
  year = 2002,
  title = {Conditional Structure versus Conditional Estimation in {NLP} Models},
  booktitle = {Proc. Empirical Methods in Natural Language Processing},
  pages = {9--16},
}

@InProceedings{banko01scaling,
  author = {Michele Banko and Eric Brill},
  year = 2001,
  title = {Scaling to Very Very Large Corpora for Natural Language Disambiguation},
  booktitle = {Proc. ACL},
}

@InCollection{joachims99making,
  author = {Thorsten Joachims},
  year = 1999,
  title = {Making Large-Scale {SVM} Learning Practical},
  booktitle = {Advances in Kernel Methods - Support Vector Learning},
  editor = {B. Sch{\"o}lkopf and C. Burges and A.
Smola},
  publisher = {MIT Press},
}

@Article{garfield55,
  title = {Citation indexes to science: {A} new dimension in documentation through association of ideas},
  author = {Eugene Garfield},
  journal = {Science},
  year = {1955},
  volume = 122,
  pages = {108--111},
}

@Article{pinskinarin,
  title = {Citation Influence for Journal Aggregates of Scientific Publications: {T}heory, with Application to the Literature of {P}hysics},
  author = {Gabriel Pinski and Francis Narin},
  journal = {IP\&M},
  year = {1976},
  volume = 12,
  pages = {297--326},
}

@Article{kumar99trawling,
  author = "Ravi Kumar and Prabhakar Raghavan and Sridhar Rajagopalan and Andrew Tomkins",
  title = "Trawling the {Web} for emerging cyber-communities",
  journal = {Computer Networks},
  volume = "31",
  number = "11--16",
  pages = "1481--1493",
  year = "1999",
  url = "citeseer.ist.psu.edu/kumar99trawling.html",
}

@Article{jacobs90scisor,
  author = {Paul S. Jacobs and Lisa F. Rau},
  title = {{SCISOR}: {E}xtracting Information from On-line News},
  journal = {CACM},
  year = 1990,
  volume = 33,
  pages = {88--97},
}

@Article{mooers50coding,
  author = {Mooers, Calvin E.},
  year = 1950,
  title = {Coding, Information Retrieval, and the Rapid Selector},
  journal = {American Documentation},
  volume = 1,
  number = 4,
  pages = {225--229},
}

@Article{kent55operational,
  author = {Allen Kent and Madeline M. Berry and Luehrs, Jr., Fred U. and J. W. Perry},
  year = 1955,
  title = {Machine Literature Searching {VIII}. {O}perational Criteria for Designing Information Retrieval Systems},
  journal = {American Documentation},
  volume = 6,
  number = 2,
  pages = {93--101},
}

@Article{swanson88historical,
  author = {Don R.
Swanson},
  year = 1988,
  title = {Historical Note: {I}nformation Retrieval and the Future of an Illusion},
  journal = {JASIS},
  volume = 39,
  number = 2,
  pages = {92--98},
}

% Chapter in an edited book, so booktitle is the book's title without a "Proc." prefix.
@incollection{littman98automatic,
  author = {Michael L. Littman and Susan T. Dumais and Thomas K. Landauer},
  title = "Automatic cross-language information retrieval using latent semantic indexing",
  editor = {Gregory Grefenstette},
  booktitle = {Cross-Language Information Retrieval},
  year = "1998",
  url = "citeseer.ist.psu.edu/littman98automatic.html",
  publisher = {Kluwer},
}

@Article{berryyoung1995,
  author = {Michael Berry and Paul Young},
  year = 1995,
  title = {Using latent semantic indexing for multilanguage information retrieval},
  journal = {Computers and the Humanities},
  volume = 29,
  number = 6,
  pages = {413--429},
}

@Book{kemenysnell,
  title = {Finite {M}arkov Chains},
  address = {New York},
  author = {John G. Kemeny and J. Laurie Snell},
  publisher = {Springer},
  year = {1976},
}

@PhDThesis{brown95,
  author = {Eric W.
Brown},
  title = {Execution Performance Issues in Full-Text Information Retrieval},
  school = {University of Massachusetts, Amherst},
  year = 1995,
}

@Article{berkhinbsa,
  author = {Pavel Berkhin},
  year = 2006,
  title = {Bookmark-Coloring Algorithm for Personalized Pagerank Computing},
  journal = {Internet Mathematics},
  volume = 3,
  number = 1,
  pages = {41--62},
}

% NOTE(review): the annote text ends mid-sentence ("..., but") in the source; left as found.
@InProceedings{murata00japanese,
  author = {Masaki Murata and Qing Ma and Kiyotaka Uchimoto and Hiromi Ozaku and Masao Utiyama and Hitoshi Isahara},
  year = 2000,
  title = {Japanese probabilistic information retrieval using location and category information},
  booktitle = {International Workshop on Information Retrieval With {A}sian Languages},
  pages = {81--88},
  url = {portal.acm.org/citation.cfm?doid=355214.355226},
  annote = {Improves ad hoc IR results (IREX) by upweighting terms in title and first sentence of newswire docs. Doesn't clearly distinguish the effectiveness of the two, but},
}

@Article{ko04improving,
  author = {Youngjoong Ko and Jinwoo Park and Jungyun Seo},
  title = {Improving text categorization using the importance of sentences},
  year = 2004,
  journal = {IP\&M},
  volume = 40,
  number = 1,
  pages = {65--79},
}

@Article{cohen99context,
  author = {William W. Cohen and Yoram Singer},
  year = {1999},
  title = {Context-Sensitive Learning Methods for Text Categorization},
  journal = {TOIS},
  volume = 17,
  number = 2,
  pages = {141--173},
}

@InProceedings{kolcz01summarization,
  author = {Ko{\l}cz, Aleksander and Prabakarmurthi, Vidya and Kalita, Jugal},
  title = {Summarization as feature selection for text categorization},
  booktitle = {Proc.
CIKM},
  publisher = {ACM Press},
  year = 2001,
  pages = {365--370},
}

@Article{kozlov79polynomial,
  author = {Kozlov, M. K. and Tarasov, S. P. and Khachiyan, L. G.},
  title = {Polynomial Solvability of Convex Quadratic Programming},
  journal = {Soviet Mathematics Doklady},
  volume = 20,
  year = 1979,
  pages = {1108--1111},
  note = {Translated from original in \emph{Doklady Akademiia Nauk SSR}, 228 (1979)},
}

@InProceedings{kolcz07raising,
  author = {Aleksander Ko{\l}cz and {Wen-Tau} Yih},
  title = {Raising the Baseline for High-Precision Text Classifiers},
  booktitle = {Proc. KDD},
  year = 2007,
}

% Editor initials are space-separated so BibTeX parses each one as a separate given-name token.
@InCollection{platt00probabilistic,
  author = {John Platt},
  title = {Probabilistic outputs for support vector machines and comparisons to regularized likelihood methods},
  editor = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf and D. Schuurmans},
  booktitle = {Advances in Large Margin Classifiers},
  pages = {61--74},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  year = 2000,
}

@InProceedings{weston99svms,
  author = {Jason Weston and Chris Watkins},
  title = {Support Vector Machines for Multi-class Pattern Recognition},
  year = 1999,
  booktitle = {Proc. European Symposium on Artificial Neural Networks},
  pages = {219--224},
}

@Article{crammer01algorithmic,
  author = {Koby Crammer and Yoram Singer},
  year = 2001,
  title = {On the algorithmic implementation of multiclass kernel-based machines},
  journal = {JMLR},
  volume = 2,
  pages = {265--292},
}

@InProceedings{geng07feature,
  author = {Xiubo Geng and Tie-Yan Liu and Tao Qin and Hang Li},
  title = {Feature Selection for Ranking},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  pages = {407--414},
  year = 2007,
}

@Article{jarvelin02cumulated,
  author = {Kalervo J{\"a}rvelin and Jaana Kek{\"a}l{\"a}inen},
  title = {Cumulated gain-based evaluation of {IR} techniques},
  journal = {TOIS},
  year = 2002,
  volume = 20,
  number = 4,
  pages = {422--446},
}

@Article{kekalainen02graded,
  author = {Jaana Kek{\"a}l{\"a}inen and Kalervo J{\"a}rvelin},
  title = {Using Graded Relevance Assessments in {IR} Evaluation},
  journal = {JASIST},
  year = 2002,
  volume = 53,
  number = 13,
  pages = {1120--1129},
}

@InProceedings{burges05learning,
  author = {Chris Burges and Tal Shaked and Erin Renshaw and Ari Lazier and Matt Deeds and Nicole Hamilton and Greg Hullender},
  title = {Learning to rank using gradient descent},
  booktitle = {Proc. ICML},
  year = 2005,
}

@InCollection{herbrich00large,
  author = {Ralf Herbrich and Thore Graepel and Klaus Obermayer},
  year = 2000,
  title = {Large margin rank boundaries for ordinal regression},
  booktitle = {Advances in Large Margin Classifiers},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  pages = {115--132},
}

@InProceedings{yue07svm,
  author = {Yisong Yue and Thomas Finley and Filip Radlinski and Thorsten Joachims},
  title = {A Support Vector Method for Optimizing Average Precision},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 2007,
}

@InProceedings{taylor06optimisation,
  author = {Michael Taylor and Hugo Zaragoza and Nick Craswell and Stephen Robertson and Chris Burges},
  title = {Optimisation methods for ranking functions with multiple parameters},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  year = 2006,
}

@InProceedings{wong88linear,
  author = {S. K.
Michael Wong and Yiyu Yao and Peter Bollmann},
  title = {Linear Structure in Information Retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 1988,
  pages = {219--232},
}

@InProceedings{gey94inferring,
  author = {Fredric C. Gey},
  title = {Inferring Probability of Relevance Using the Method of Logistic Regression},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 1994,
  pages = {222--231},
}

@InProceedings{cao06adapting,
  author = {Yunbo Cao and Jun Xu and Tie-Yan Liu and Hang Li and Yalou Huang and Hsiao-Wuen Hon},
  title = {Adapting {R}anking {SVM} to Document Retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 2006,
}

@InProceedings{qin07ranking,
  author = {Tao Qin and Tie-Yan Liu and Wei Lai and Xu-Dong Zhang and De-Sheng Wang and Hang Li},
  title = {Ranking with Multiple Hyperplanes},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 2007,
}

@incollection{indyk04nearest,
  author = {Piotr Indyk},
  title = {Nearest Neighbors in High-Dimensional Spaces},
  booktitle = {Handbook of Discrete and Computational Geometry},
  edition = {2nd},
  pages = {877--892},
  editor = {J. E. Goodman and J.
O'Rourke},
  publisher = {Chapman and Hall/CRC Press},
  address = {New York},
  year = {2004}
}
%% Chris reinserted missing references

@article{godoy06modeling,
  author = {Daniela Godoy and Anal{\'i}a Amandi},
  title = {Modeling user interests by conceptual clustering},
  journal = {Information Systems},
  volume = {31},
  number = {4},
  year = {2006},
  issn = {0306-4379},
  pages = {247--265},
  doi = {dx.doi.org/10.1016/j.is.2005.02.008},
  publisher = {Elsevier Science},
  address = {Oxford, UK},
}

@inproceedings{fang04formal,
  author = {Hui Fang and Tao Tao and ChengXiang Zhai},
  title = {A formal study of information retrieval heuristics},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2004},
  isbn = {1-58113-881-4},
  pages = {49--56},
  location = {Sheffield, United Kingdom},
  doi = {doi.acm.org/10.1145/1008992.1009004},
  address = {New York, NY},
}

@article{sauvagnat06answering,
  author = {Karen Sauvagnat and Mohand Boughanem and Claude Chrisment},
  title = {Answering content and structure-based queries on {XML} documents using relevance propagation},
  journal = {Information Systems},
  volume = {31},
  number = {7},
  year = {2006},
  issn = {0306-4379},
  pages = {621--635},
  doi = {dx.doi.org/10.1016/j.is.2005.11.007},
  publisher = {Elsevier Science},
  address = {Oxford, UK},
}

% booktitle and the other proceedings fields are expected to come from the crossref parent entry.
@InProceedings{westerveld07tijah,
  author = {Thijs Westerveld and Henning Rode and Roel van Os and Djoerd Hiemstra and Georgina Ram{\'\i}rez and Vojkan Mihajlovic and Arjen P. de Vries},
  title = {Evaluating Structured Information Retrieval and Multimedia Retrieval using {PF/Tijah}},
  year = 2007,
  pages = {104--114},
  crossref = {fuhr07comparative},
}

@inproceedings{vogt99user,
  author = "Christopher C.
Vogt and Garrison W. Cottrell and Richard K. Belew and Brian T. Bartell",
  title = "User Lenses -- {A}chieving 100\% Precision on Frequently Asked Questions",
  year = 1999,
  booktitle = {Proc. International Conference on User Modelling},
}

% The title carries no trailing period; bibliography styles add their own punctuation.
@article{cambazoglu06performance,
  author = {{Berkant Barla} Cambazoglu and Cevdet Aykanat},
  title = {Performance of query processing implementations in ranking-based text retrieval systems using inverted indices},
  journal = {IP\&M},
  volume = {42},
  number = {4},
  year = {2006},
  pages = {875--898},
  doi = {dx.doi.org/10.1016/j.ipm.2005.06.004},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{fradkin03experiments,
  author = {Dmitriy Fradkin and David Madigan},
  title = {Experiments with random projections for machine learning},
  booktitle = {Proc. KDD},
  publisher = {ACM Press},
  year = {2003},
  isbn = {1-58113-737-0},
  pages = {517--522},
  location = {Washington, D.C.},
  doi = {doi.acm.org/10.1145/956750.956812},
  address = {New York, NY},
}

@inproceedings{bennett99densitybased,
  author = {Kristin P. Bennett and Usama Fayyad and Dan Geiger},
  title = {Density-based indexing for approximate nearest-neighbor queries},
  booktitle = {Proc. KDD},
  publisher = {ACM Press},
  year = {1999},
  isbn = {1-58113-143-7},
  pages = {233--243},
  location = {San Diego, California, United States},
  doi = {doi.acm.org/10.1145/312129.312236},
  address = {New York, NY},
}

@inproceedings{buckley85optimization,
  author = {Chris Buckley and Alan F. Lewit},
  title = {Optimization of inverted vector searches},
  booktitle = {Proc.
SIGIR},
  year = {1985},
  isbn = {0-89791-159-8},
  pages = {97--110},
  location = {Montreal},
  doi = {doi.acm.org/10.1145/253495.253515},
  publisher = {ACM Press},
  address = {New York, NY},
}

@inproceedings{guttman84rtrees,
  author = {Antonin Guttman},
  title = {R-trees: {A} dynamic index structure for spatial searching},
  booktitle = {Proc. SIGMOD},
  publisher = {ACM Press},
  year = {1984},
  isbn = {0-89791-128-8},
  pages = {47--57},
  location = {Boston, Massachusetts},
  doi = {doi.acm.org/10.1145/602259.602266},
  address = {New York, NY},
}

@article{fuernkranz02round,
  author = {Johannes F{\"u}rnkranz},
  title = {Round robin classification},
  journal = {JMLR},
  volume = {2},
  year = {2002},
  issn = {1533-7928},
  pages = {721--747},
  publisher = {MIT Press},
  address = {Cambridge, MA},
}

@inproceedings{forman04learning,
  author = {George Forman and Ira Cohen},
  title = {Learning from Little: {C}omparison of Classifiers Given Little Training},
  booktitle = {Proc. PKDD},
  year = {2004},
  pages = {161--172},
  ee = {springerlink.metapress.com/openurl.asp?genre=article{\&}issn=0302-9743{\&}volume=3202{\&}spage=161},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{bottou94convergence,
  author = {L{\'e}on Bottou and Yoshua Bengio},
  title = {Convergence Properties of the {K}-Means Algorithms},
  booktitle = {Proc.
NIPS},
  year = {1994},
  pages = {585--592},
  ee = {nips.djvuzone.org/djvu/nips07/0585.djvu},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% {EM} is brace-protected so sentence-casing styles do not lowercase the acronym.
@article{nigam00em,
  author = {Kamal Nigam and Andrew Kachites McCallum and Sebastian Thrun and Tom Mitchell},
  title = {Text Classification from Labeled and Unlabeled Documents using {EM}},
  journal = {Machine Learning},
  volume = {39},
  number = {2--3},
  year = {2000},
  issn = {0885-6125},
  pages = {103--134},
  publisher = {Kluwer},
  address = {Hingham, MA},
}

@inproceedings{weiss03web,
  editor = "Mieczys{\l{}}aw A. K{\l{}}opotek and S{\l{}}awomir T. Wierzcho{{\'n}} and Krzysztof Trojanowski",
  author = "Dawid Weiss and Jerzy Stefanowski",
  title = "Web Search Results Clustering in {P}olish: {E}xperimental evaluation of {C}arrot",
  booktitle = {Proc. New Trends in Intelligent Information Processing and Web Mining Conference},
  year = "2003",
}

% The "Jr." suffix uses BibTeX's "Last, Jr., First" form instead of being braced into the surname.
@inproceedings{azcarraga01extracting,
  author = "Arnulfo P. Azcarraga and Yap, Jr., Teddy N.",
  title = "Extracting Meaningful Labels for {WEBSOM} Text Archives",
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  pages = "41--48",
  year = 2001,
  url = "citeseer.ist.psu.edu/azcarraga01extracting.html",
}

@inproceedings{roos06compression,
  author = {Teemu Roos and Tuomas Heikkil{\"a} and Petri Myllym{\"a}ki},
  title = {A Compression-Based Method for Stemmatic Analysis},
  booktitle = {Proc. ECAI},
  year = {2006},
  pages = {805--806},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Article{hand01idiot,
  author = {David J.
Hand and Keming Yu},
  title = {Idiot's {Bayes}: {N}ot So Stupid after All},
  year = 2001,
  journal = {International Statistical Review},
  volume = 69,
  number = 3,
  pages = {385--398},
}

@inproceedings{pavlov04document,
  author = {Dmitry Pavlov and Ramnath Balasubramanyan and Byron Dom and Shyam Kapur and Jignashu Parikh},
  title = {Document Preprocessing For Naive {Bayes} Classification and Clustering with Mixture of Multinomials},
  booktitle = {Proc. KDD},
  pages = {829--834},
  year = {2004},
}

% NOTE(review): this ISBN has only 12 digits (no check digit); confirm against the proceedings.
@Proceedings{ismir07,
  title = {International Conference on Music Information Retrieval (ISMIR 2007)},
  year = {2007},
  editor = {Simon Dixon and David Bainbridge and Rainer Typke},
  isbn = {978-3-85403-218},
  url = {ismir2007.ismir.net/}
}

@article{downie06music,
  author = {J. Stephen Downie},
  title = {The {M}usic {I}nformation {R}etrieval {E}valuation e{X}change ({MIREX})},
  journal = {D-Lib Magazine},
  year = 2006,
  month = dec,
  volume = 12,
  number = 12,
  issn = {1082-9873},
}

@book{bimbo99visual,
  author = {del Bimbo, Alberto},
  title = {Visual Information Retrieval},
  year = {1999},
  publisher = {Morgan Kaufmann},
}

@book{lew01principles,
  author = {Michael S. Lew},
  title = {Principles of Visual Information Retrieval},
  year = {2001},
  publisher = {Springer},
}

@book{coden02speech,
  title = {Information Retrieval Techniques for Speech Applications},
  editor = {Anni R. Coden and Eric W.
Brown and Savitha Srinivasan},
  year = 2002,
  publisher = {Springer},
}

@book{lesk04understanding,
  author = {Michael Lesk},
  title = {Understanding Digital Libraries},
  year = 2004,
  edition = {2nd},
  publisher = {Morgan Kaufmann}
}

@article{levenshtein66binary,
  author = {Vladimir I. Levenshtein},
  journal = {Soviet Physics Doklady},
  number = {8},
  pages = {707--710},
  title = {Binary codes capable of correcting deletions, insertions, and reversals},
  volume = {10},
  year = {1966}
}

@article{wagner74string,
  author = {Robert A. Wagner and Michael J. Fischer},
  title = {The String-to-String Correction Problem},
  journal = {JACM},
  volume = {21},
  number = {1},
  year = {1974},
  issn = {0004-5411},
  pages = {168--173},
  doi = {doi.acm.org/10.1145/321796.321811},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@article{teh06hdp,
  author = {Yee Whye Teh and Michael I. Jordan and Matthew J. Beal and David M. Blei},
  title = {Hierarchical {D}irichlet Processes},
  journal = {Journal of the American Statistical Association},
  year = 2006,
  volume = 101,
  number = 476,
  pages = {1566--1581},
}

@inproceedings{wei06lda,
  author = {Xing Wei and W. Bruce Croft},
  title = {{LDA}-based document models for ad-hoc retrieval},
  booktitle = {Proc. SIGIR},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {178--185},
  location = {Seattle, Washington, USA},
  doi = {doi.acm.org/10.1145/1148170.1148204},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@inproceedings{lavrenko01relevance,
  author = {Lavrenko, Victor and Croft, W. Bruce},
  title = {Relevance-based language models},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  year = 2001,
  pages = {120--127}
}

@inproceedings{zhai02two,
  author = {ChengXiang Zhai and John Lafferty},
  title = {Two-stage language models for information retrieval},
  booktitle = {Proc. SIGIR},
  year = {2002},
  isbn = {1-58113-561-0},
  pages = {49--56},
  location = {Tampere, Finland},
  doi = {doi.acm.org/10.1145/564376.564387},
  publisher = {ACM Press},
  address = {New York, NY, USA}
}

@inproceedings{kraaij02importance,
  author = {Wessel Kraaij and Thijs Westerveld and Djoerd Hiemstra},
  year = 2002,
  title = {The Importance of Prior Probabilities for Entry Page Search},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {27--34}
}

@inproceedings{tao06language,
  author = {Tao Tao and Xuanhui Wang and Qiaozhu Mei and ChengXiang Zhai},
  title = {Language Model Information Retrieval with Document Expansion},
  booktitle = {Proc. Human Language Technology Conference / North American Chapter of the Association for Computational Linguistics},
  year = 2006,
  pages = {407--414}
}

@incollection{lafferty03probabilistic,
  author = {John Lafferty and Chengxiang Zhai},
  title = {Probabilistic relevance models based on document and query generation},
  editor = {W. Bruce Croft and John Lafferty},
  booktitle = {Language Modeling for Information Retrieval},
  year = 2003,
  publisher = {Kluwer}
}

@incollection{kraaij03language,
  author = {Wessel Kraaij and Martijn Spitters},
  year = 2003,
  title = {Language Models for Topic Tracking},
  booktitle = {Language Modeling for Information Retrieval},
  editor = {W. B. Croft and J. Lafferty},
  pages = {95--124},
  publisher = {Kluwer}
}

@inproceedings{xu99clusterbased,
  author = {Jinxi Xu and W.
Bruce Croft},
  title     = {Cluster-based language models for distributed retrieval},
  booktitle = {Proc. SIGIR},
  year      = {1999},
  isbn      = {1-58113-096-1},
  pages     = {254--261},
  location  = {Berkeley, California, United States},
  doi       = {doi.acm.org/10.1145/312624.312687},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
}

% Entries below use one field per line, brace delimiters and lowercase entry
% types. Field values are unchanged.
@article{muresan04topic,
  author    = {Gheorghe Muresan and David J. Harper},
  title     = {Topic modeling for mediated access to very large document collections},
  journal   = {JASIST},
  volume    = {55},
  number    = {10},
  pages     = {892--910},
  year      = {2004},
  issn      = {1532-2882},
  doi       = {dx.doi.org/10.1002/asi.20034},
  publisher = {John Wiley \& Sons},
  address   = {New York, NY, USA},
}

@inproceedings{kurland04corpus,
  author    = {Oren Kurland and Lillian Lee},
  title     = {Corpus structure, language models, and ad hoc information retrieval},
  booktitle = {Proc. SIGIR},
  pages     = {194--201},
  year      = {2004},
  isbn      = {1-58113-881-4},
  location  = {Sheffield, United Kingdom},
  doi       = {doi.acm.org/10.1145/1008992.1009027},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
}

@inproceedings{buckley00evaluating,
  author    = {Chris Buckley and Ellen M. Voorhees},
  title     = {Evaluating Evaluation Measure Stability},
  booktitle = {Proc. SIGIR},
  pages     = {33--40},
  year      = {2000},
}

@inproceedings{tague-sutcliffe95statistical,
  author    = {Jean Tague-Sutcliffe and James Blustein},
  title     = {A statistical analysis of the {TREC-3} data},
  booktitle = {Proc.
TREC},
  pages     = {385--398},
  year      = {1995},
}

% Entries below use one field per line, brace delimiters and lowercase field
% names. Field values are unchanged; wrapped values are joined onto one line.
@article{sakai07reliability,
  author    = {Tetsuya Sakai},
  title     = {On the reliability of information retrieval metrics based on graded relevance},
  journal   = {IP\&M},
  volume    = {43},
  number    = {2},
  pages     = {531--548},
  year      = {2007},
}

@inproceedings{zobel98reliable,
  author    = {Justin Zobel},
  title     = {How reliable are the results of large-scale information retrieval experiments?},
  booktitle = {Proc. SIGIR},
  pages     = {307--314},
  year      = {1998},
}

@article{schamber90re-examination,
  author    = {Linda Schamber and Michael Eisenberg and Michael S. Nilan},
  title     = {A re-examination of relevance: toward a dynamic, situational definition},
  journal   = {IP\&M},
  volume    = {26},
  number    = {6},
  pages     = {755--776},
  year      = {1990},
}

@inproceedings{hersh00batch,
  author    = {William R. Hersh and Andrew Turpin and Susan Price and Benjamin Chan and Dale Kraemer and Lynetta Sacherek and Daniel Olson},
  title     = {Do batch and user evaluation give the same results?},
  booktitle = {Proc. SIGIR},
  pages     = {17--24},
  year      = {2000},
}

@inproceedings{hersh00further,
  author    = {William R. Hersh and Andrew Turpin and Lynetta Sacherek and Daniel Olson and Susan Price and Benjamin Chan and Dale Kraemer},
  title     = {Further Analysis of Whether Batch and User Evaluations Give the Same Results with a Question-Answering Task},
  booktitle = {Proc. TREC},
  year      = {2000},
}

@article{hersh01challenging,
  author    = {William R.
Hersh and Andrew Turpin and Susan Price and Dale Kraemer and Daniel Olson and Benjamin Chan and Lynetta Sacherek},
  title     = {Challenging conventional assumptions of automated information retrieval with real users: Boolean searching and batch retrieval evaluations},
  journal   = {IP\&M},
  volume    = {37},
  number    = {3},
  pages     = {383--402},
  year      = {2001},
}

@inproceedings{turpin01why,
  author    = {Andrew Turpin and William R. Hersh},
  title     = {Why Batch and User Evaluations Do Not Give the Same Results},
  booktitle = {Proc. SIGIR},
  pages     = {225--231},
  year      = {2001},
}

@inproceedings{turpin02user,
  author    = {Andrew Turpin and William R. Hersh},
  title     = {User interface effects in past batch versus user experiments},
  booktitle = {Proc. SIGIR},
  pages     = {431--432},
  year      = {2002},
}

% The author name is written in full here to match dietterich02ensemble, so
% both entries resolve to the same person. The page range uses the
% double-hyphen en-dash form used by the other entries.
@incollection{dietterich01ensemble,
  author    = {Dietterich, Thomas G.},
  title     = {Ensemble methods in machine learning},
  editor    = {Kittler, Josef and Roli, Fabio},
  booktitle = {Multiple Classifier Systems},
  series    = {LNCS},
  volume    = {1857},
  pages     = {1--15},
  year      = {2001},
  publisher = {Springer},
}

@incollection{dietterich02ensemble,
  author    = {Thomas G. Dietterich},
  title     = {Ensemble Learning},
  editor    = {Michael A. Arbib},
  booktitle = {The Handbook of Brain Theory and Neural Networks},
  edition   = {2nd},
  year      = {2002},
  publisher = {MIT Press},
}

@incollection{schapire03boosting,
  author    = {Robert E. Schapire},
  title     = {The boosting approach to machine learning: An overview},
  editor    = {D. D. Denison and M. H. Hansen and C. Holmes and B. Mallick and B. Yu},
  booktitle = {Nonlinear Estimation and Classification},
  year      = {2003},
  publisher = {Springer},
}

@article{schapire00boostexter,
  author    = {Robert E.
Schapire and Yoram Singer},
  title     = {{BoosTexter}: A boosting-based system for text categorization},
  journal   = {Machine Learning},
  volume    = {39},
  number    = {2/3},
  pages     = {135--168},
  year      = {2000},
}

% The two chapters below take editor, year, publisher, address and booktitle
% from the parent volume through crossref. Classic BibTeX only resolves a
% crossref when the parent comes after every entry that refers to it, so the
% book follows its chapters.
@incollection{nigam06semi-supervised,
  author    = {Kamal Nigam and Andrew McCallum and Tom Mitchell},
  title     = {Semi-supervised Text Classification Using {EM}},
  pages     = {33--56},
  crossref  = {chapelle06semi-supervised},
}

@incollection{joachims06transductive,
  author    = {Thorsten Joachims},
  title     = {Transductive Support Vector Machines},
  pages     = {105--118},
  crossref  = {chapelle06semi-supervised},
}

% Parent volume for the chapters above. The booktitle repeats the title
% because classic BibTeX copies fields by name and the chapters need a
% booktitle to inherit.
@book{chapelle06semi-supervised,
  editor    = {Olivier Chapelle and Bernhard Sch{\"o}lkopf and Alexander Zien},
  title     = {Semi-Supervised Learning},
  booktitle = {Semi-Supervised Learning},
  year      = {2006},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
}

@article{tong01svm,
  author    = {Simon Tong and Daphne Koller},
  title     = {Support Vector Machine Active Learning with Applications to Text Classification},
  journal   = {JMLR},
  volume    = {2},
  pages     = {45--66},
  year      = {2001},
}

@inproceedings{baldridge04active,
  author    = {Jason Baldridge and Miles Osborne},
  title     = {Active learning and the total cost of annotation},
  booktitle = {Proc. Empirical Methods in Natural Language Processing},
  pages     = {9--16},
  year      = {2004},
  abstract  = {Active learning (AL) promises to reduce the cost of annotating labeled datasets for trainable human language technologies. Contrary to expectations, when creating labeled training material for HPSG parse selection and later reusing it with other models, gains from AL may be negligible or even negative. This has serious implications for using AL, showing that additional cost-saving strategies may need to be adopted.
We explore one such strategy: using a model during annotation to automate some of the decisions. Our best results show an 80\% reduction in annotation cost compared with labeling randomly selected data with a single model.},
}

% The percent sign in the abstract above is escaped so that a style which
% typesets the abstract does not treat the rest of the line as a LaTeX comment.

@inproceedings{sindhwani06large,
  author    = {Sindhwani, V. and Keerthi, S. S.},
  title     = {Large scale semi-supervised linear {SVMs}},
  booktitle = {Proc. SIGIR},
  pages     = {477--484},
  year      = {2006},
}

% Proper nouns in the titles below are protected as whole words rather than
% letter by letter, which keeps the capitals without braces in mid-word.
@inproceedings{richardson06beyond,
  author    = {Richardson, M. and Prakash, A. and Brill, E.},
  title     = {Beyond {PageRank}: machine learning for static ranking},
  booktitle = {Proc. WWW},
  pages     = {707--715},
  year      = {2006},
}

@article{altingovde08incremental,
  author    = {Ismail Seng{\"o}r Alting{\"o}vde and Engin Demir and Fazli Can and {\"O}zg{\"u}r Ulusoy},
  title     = {Incremental cluster-based retrieval using compressed cluster-skipping inverted files},
  journal   = {TOIS},
  year      = {2008},
  note      = {To appear},
}

@inproceedings{carterette08evaluating,
  author    = {Ben Carterette and Rosie Jones},
  title     = {Evaluating Search Engines by Modeling the Relationship Between Relevance and Clicks},
  booktitle = {Proc. NIPS},
  year      = {2008},
}

@inproceedings{metzler05markov,
  author    = {Donald Metzler and W. Bruce Croft},
  title     = {A {Markov} random field model for term dependencies},
  booktitle = {Proc.
SIGIR},
  year      = {2005},
  pages     = {472--479},
}

@book{cord08ml,
  author    = {Matthieu Cord and P{\'a}draig Cunningham},
  title     = {Machine Learning Techniques for Multimedia: Case Studies on Organization and Retrieval},
  year      = {2008},
  publisher = {Springer},
}

% The venue is written in the abbreviated proceedings form that the other
% CIKM entry in this file uses.
@inproceedings{schutze06thresholding,
  author    = {Hinrich Sch{\"u}tze and Emre Velipasaoglu and Jan Pedersen},
  title     = {Performance thresholding in practical text classification},
  booktitle = {Proc. CIKM},
  year      = {2006},
}

% NOTE(review): this paper appeared in the journal Ecology, volume 26, which
% is a different periodical from the Journal of Ecology; confirm against the
% publisher record.
@article{dice45:measures,
  author    = {L. R. Dice},
  title     = {Measures of the amount of ecologic association between species},
  journal   = {Ecology},
  volume    = {26},
  pages     = {297--302},
  year      = {1945},
}

% The doi and publisher below follow the form used by the other ACM
% proceedings entries in this file: resolver host without a URL scheme, and
% ACM Press as the publisher name.
@inproceedings{dhillon02enhanced,
  author    = {Inderjit S. Dhillon and Subramanyam Mallela and Rahul Kumar},
  title     = {Enhanced word clustering for hierarchical text classification},
  booktitle = {KDD '02: Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining},
  pages     = {191--200},
  year      = {2002},
  isbn      = {1-58113-567-X},
  location  = {Edmonton, Alberta, Canada},
  doi       = {doi.acm.org/10.1145/775047.775076},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
}