Skip to content


BibTeX bibliography

out.bib

% ANSI standard Z39.14 on writing abstracts. The corporate author is
% double-braced so BibTeX treats it as one indivisible name.
@techreport{ANSI-79,
  author      = {{American National Standards Institute Inc.}},
  title       = {{American National Standard} for Writing Abstracts},
  institution = {American National Standards Institute},
  number      = {ANSI Z39.14-1979},
  address     = {New York},
  year        = 1979,
}
% Conference paper: a lexical-chain summarizer extended with rhetorical
% structure from discourse markers. Compound surnames are braced so they
% are not split. Conference days are kept in eventdate (ignored by classic
% BibTeX styles, used by biblatex); month is the standard macro.
@inproceedings{alemany-EACL-03,
  author    = {{Alonso i Alemany}, Laura and {Fuentes Fort}, Maria},
  title     = {Integrating cohesion and coherence for Automatic
    Summarization},
  booktitle = {Proceedings of the 10th Conference of the European
    Chapter of the Association for Computational
    Linguistics},
  address   = {Budapest, Hungary},
  month     = apr,
  year      = 2003,
  eventdate = {2003-04-12/2003-04-17},
  pages     = {1--8},
  url       = {http://www.aclweb.org/anthology/E/E03/E03-3002.pdf},
  abstract  = {This paper presents the integration of cohesive
    properties of text with coherence relations, to
    obtain an adequate representation of text for
    automatic summarization. A summarizer based on
    Lexical Chains is enhanced with rhetorical and
    argumentative structure obtained via Discourse
    Markers. When evaluated with newspaper corpus, this
    integration yields only slight improvement in the
    resulting summaries and cannot beat a dummy baseline
    consisting of the first sentence in the
    document. Nevertheless, we argue that this approach
    relies on basic linguistic mechanisms and is
    therefore genre-independent.},
}
% Journal article on an indexing experiment for technical literature.
% The acronym in the journal name is braced to survive any recasing.
@article{baxendale-58,
  author  = {Baxendale, Phyllis B.},
  title   = {Machine-made index for technical literature---an
    experiment},
  journal = {{IBM} Journal of Research and Development},
  volume  = 2,
  number  = 4,
  year    = 1958,
  pages   = {354--361},
}
% Book chapter on phrase-level, salience-based content characterisation.
% The url is stored raw (no LaTeX escapes) so url-aware styles can
% typeset and link it verbatim.
@incollection{boguraev-99,
  author    = {Boguraev, Branimir and Kennedy, Christopher},
  title     = {Salience-based content characterisation of text
    documents},
  booktitle = {Advances in Automated Text Summarization},
  editor    = {Mani, Inderjeet and Maybury, Mark T.},
  publisher = {The MIT Press},
  year      = 1999,
  pages     = {99--110},
  url       = {http://www.ling.northwestern.edu/~kennedy/Docs/content-char_abs.html},
  abstract  = {Traditionally, the document summarisation task has
    been tackled either as a natural language processing
    problem, with an instantiated meaning template being
    rendered into a coherent prose, or as a passage
    extraction problem, where certain fragments
    (typically sentences) of the source document are
    deemed to be highly representative of its content,
    and thus delivered as meaningful ``approximations''
    of it. Balancing the conflicting requirements of
    depth and accuracy of a summary, on the one hand,
    and document and domain independence, on the other,
    has proven a very hard problem. This paper describes
    a novel approach to content characterisation of text
    documents. It is domain- and genre-independent, by
    virtue of not requiring an in-depth analysis of the
    full meaning. At the same time, it remains closer to
    the core meaning by choosing a different granularity
    of its representations (phrasal expressions rather
    than sentences or paragraphs), by exploiting a
    notion of discourse contiguity and coherence for the
    purposes of uniform coverage and context
    maintenance, and by utilising a strong linguistic
    notion of salience, as a more appropriate and
    representative measure of a document's
    ``aboutness''.},
}
% Book on abstracting. The publisher's city is held in address rather
% than inside the publisher name.
@book{borko-75,
  author    = {Borko, Harold and Bernier, Charles L.},
  title     = {Abstracting concepts and methods},
  publisher = {Academic Press},
  address   = {London},
  year      = 1975,
}
% PhD thesis on using discourse structure for content selection in
% query-based summarization (graph-based framework, DUC 2005/2006
% evaluations, picture-illustrated summaries).
@phdthesis{bosma-phd-08,
  author   = {Bosma, Wauter Eduard},
  title    = {Discourse oriented summarization},
  school   = {University of Twente},
  year     = 2008,
  abstract = {The meaning of text appears to be tightly related to
    intentions and circumstances. Context sensitivity of
    meaning is addressed by theories of discourse
    structure. Few attempts have been made to exploit
    text organization in summarization. This thesis is
    an exploration of what knowledge of discourse
    structure can do for content selection as a subtask
    of automatic summarization, and query-based
    summarization in particular. Query-based
    summarization is the task of answering an arbitrary
    user query or question by using content from
    potentially relevant sources. This thesis presents a
    general framework for discourse oriented
    summarization, relying on graphs to represent
    semantic relations in discourse, and redundancy as a
    special type of semantic relation. Semantic
    relations occur on several levels of text analysis
    (query-relevance, coherence, layout, etc.), and a
    broad range of textual features may be required to
    detect them. The graph-based framework facilitates
    combining multiple features into an integrated
    semantic model of the documents to
    summarize. Recognizing redundancy and entailment
    relations between text passages is particularly
    important when a summary is generated of multiple
    documents, e.g. to avoid including redundant content
    in a summary. For this reason, I pay particular
    attention to recognizing textual entailment. Within
    this framework, a three-fold evaluation is performed
    to evaluate different aspects of discourse oriented
    summarization. The first is a user study, measuring
    the effect on user appreciation of using a
    particular type of knowledge for query-based
    summarization. In this study, three presentation
    strategies are compared: summarization using the
    rhetorical structure of the source, a baseline
    summarization method which uses the layout of the
    source, and a baseline presentation method which
    uses no summarization but just a concise answer to
    the query. Results show that knowledge of the
    rhetorical structure not only helps to provide the
    necessary context for the user to verify that the
    summary addresses the query adequately, but also to
    increase the amount of relevant content. The second
    evaluation is a comparison of implementations of the
    graph-based framework which are capable of fully
    automatic summarization. The two variables in the
    experiment are the set of textual features used to
    model the source and the algorithm used to search a
    graph for relevant content. The features are based
    on cosine similarity, and are realized as graph
    representations of the source. The graph search
    algorithms are inspired by existing algorithms in
    summarization. The quality of summaries is measured
    using the Rouge evaluation toolkit. The best
    performer would have ranked first (Rouge-2) or
    second (Rouge-SU4) if it had participated in the DUC
    2005 query-based summarization challenge. The third
    study is an evaluation in the context of the DUC
    2006 summarization challenge, which includes
    readability measurements as well as various
    content-based evaluation metrics. The evaluated
    automatic discourse oriented summarization system is
    similar to the one described above, but uses
    additional features, i.e. layout and textual
    entailment. The system performed well on readability
    at the cost of content-based scores which were well
    below the scores of the highest ranking DUC 2006
    participant. This indicates a trade-off between
    readable, coherent content and useful content, an
    issue yet to be explored. Previous research implies
    that theories of text organization generalize well
    to multimedia. This suggests that the discourse
    oriented summarization framework applies to
    summarizing multimedia as well, provided sufficient
    knowledge of the organization of the (multimedia)
    source documents is available. The last study in
    this thesis is an investigation of the applicability
    of structural relations in multimedia for generating
    picture-illustrated summaries, by relating summary
    content to picture-associated text (i.e. captions or
    surrounding paragraphs). Results suggest that
    captions are the more suitable annotation for
    selecting appropriate pictures. Compared to manual
    illustration, results of automatic pictures are
    similar if the manual picture is mainly decorative.},
}
% Journal article comparing lead-based and sentence-selection news
% summaries. The DOI is stored bare in doi; url carries the resolver
% form for styles that only print url.
@article{brandow-IPM-95,
  author   = {Brandow, Ronald and Mitze, Karl and Rau, Lisa F.},
  title    = {Automatic condensation of electronic publications by
    sentence selection},
  journal  = {Information Processing \& Management},
  volume   = 31,
  number   = 5,
  year     = 1995,
  pages    = {675--685},
  doi      = {10.1016/0306-4573(95)00052-I},
  url      = {https://doi.org/10.1016/0306-4573(95)00052-I},
  abstract = {As electronic information access becomes the norm,
    and the variety of retrievable material increases,
    automatic methods of summarizing or condensing text
    will become critical. This paper describes a system
    that performs domain-independent automatic
    condensation of news from a large commercial news
    service encompassing 41 different publications.
    This system was evaluated against a system that
    condensed the same articles using only the first
    portion of the texts (the lead), up to the target
    length of the summaries. Three lengths of articles
    were evaluated for 250 documents by both systems,
    totalling 1500 suitability judgements in all. The
    outcome of perhaps the largest evaluation of human
    vs machine summarization performed to date was
    unexpected. The lead-based summaries outperformed
    the ``intelligent'' summaries significantly, achieving
    acceptability ratings of over 90\%, compared to
    74.4\%. This paper briefly reviews the literature,
    details the implications of these results, and
    addresses the remaining hopes for content-based
    summarization. We expect the results presented here
    to be useful to other researchers currently
    investigating the viability of summarization through
    sentence selection heuristics.},
}
% DUC 2001 system description: the University of Lethbridge summarizer
% based on lexical chains. Conference days are kept in eventdate
% (ignored by classic BibTeX styles); month is the standard macro.
@inproceedings{brunn-DUC-01,
  author    = {Brunn, Meru and Chali, Yllias and Pinchak,
    Christopher J.},
  title     = {Text Summarization Using Lexical Chains},
  booktitle = {Proceedings of DUC2001 Conference},
  address   = {New Orleans, Louisiana, USA},
  month     = sep,
  year      = 2001,
  eventdate = {2001-09-13/2001-09-14},
  url       = {http://www-nlpir.nist.gov/projects/duc/pubs/2001papers/lethbridge.pdf},
  annote    = {Available at:
    http://www-nlpir.nist.gov/projects/duc/pubs/2001papers/lethbridge.pdf},
  abstract  = {Text summarization addresses both the problem of
    selecting the most important portions of text and
    the problem of generating coherent summaries. We
    present in this paper the summarizer of the
    University of Lethbridge at DUC 2001, which is based
    on an efficient use of lexical chains.},
}
% Textbook on indexing and abstracting.
% NOTE(review): this book is often cited with a second author
% (Ana D. Cleveland) -- confirm against the title page before adding.
@book{cleveland-83,
  title     = {Introduction to Indexing and Abstracting},
  author    = {Cleveland, Donald B.},
  year      = 1983,
  publisher = {Libraries Unlimited, Inc},
}
% Book on abstracts and abstracting services.
% NOTE(review): library catalogues appear to spell this author
% "Collison, Robert L." -- verify before changing the name; the citation
% key is deliberately left as-is so existing citations keep working.
@book{collinson-71,
  author    = {Collinson, R.},
  title     = {Abstracts and abstracting services},
  publisher = {American Bibliographical Center--Clio Press},
  year      = 1971,
}
% Book on abstracting, second edition. Classic BibTeX styles expect the
% edition as a capitalised ordinal word; the publisher's city lives in
% address.
@book{cremmins-96,
  author    = {Cremmins, Edward T.},
  title     = {The Art of Abstracting},
  edition   = {Second},
  publisher = {Information Resources Press},
  address   = {Arlington, Va.},
  year      = 1996,
}
% Book chapter giving an overview of the FRUMP system. The acronym is
% braced so sentence-casing styles keep it upper-case.
@incollection{dejong-82,
  author    = {DeJong, G.},
  title     = {An overview of the {FRUMP} system},
  booktitle = {Strategies for natural language processing},
  editor    = {Lehnert, W. G. and Ringle, M. H.},
  publisher = {Lawrence Erlbaum},
  address   = {Hillsdale, NJ},
  year      = 1982,
  pages     = {149--176},
}
% Book on the semantics and pragmatics of discourse. The author is
% written in "von Last, First" form so the particle is parsed correctly.
@book{dijk-80,
  author    = {van Dijk, Teun A.},
  title     = {Text and context: explorations in the semantics and
    pragmatics of discourse},
  publisher = {Longman},
  address   = {London},
  year      = 1980,
}
% Journal article on sentence extraction using cue words, title and
% heading words, and sentence location in addition to word frequency.
@article{edmundson-ACM-69,
  author   = {Edmundson, H. P.},
  title    = {New methods in Automatic Extracting},
  journal  = {Journal of the Association for Computing Machinery},
  volume   = 16,
  number   = 2,
  month    = apr,
  year     = 1969,
  pages    = {264--285},
  url      = {http://courses.ischool.berkeley.edu/i256/f06/papers/edmonson69.pdf},
  abstract = {This paper describes new methods of automatically
    extracting documents for screening purposes,
    i.e. the computer selection of sentences having the
    greatest potential for conveying to the reader the
    substance of the document. While previous work has
    focused on one component of sentence significance,
    namely, the presence of high-frequency content words
    (key words), the methods described here also treat
    three additional components: pragmatic words (cue
    words); title and heading words; and structural
    indicators (sentence location). The research has
    resulted in an operating system and a research
    methodology. The extracting system is parameterized
    to control and vary the influence of the above four
    components. The research methodology includes
    procedures for the compilation of the required
    dictionaries, the setting of the control parameters,
    and the comparative evaluation of the automatic
    extracts with manually produced extracts. The
    results indicate that the three newly proposed
    components dominate the frequency component in the
    production of better extracts.},
}
% PhD thesis on user-sensitive summarization of clinical studies for
% physicians and patients. Non-ASCII characters are written as LaTeX
% escapes / ASCII so the entry is safe under classic 8-bit BibTeX.
@phdthesis{elhadad-phd-06,
  author   = {Elhadad, No{\'e}mie},
  title    = {User-Sensitive Text Summarization: Application to
    the Medical Domain},
  school   = {Columbia University},
  type     = {Ph.D. Thesis},
  year     = 2006,
  url      = {http://people.dbmi.columbia.edu/noemie/papers/thesis.pdf},
  abstract = {In this thesis, we present a user-sensitive approach
    to text summarization. One domain which would
    highly benefit from tailoring summaries to both
    individual and class-based user characteristics is
    the medical domain, where physicians and patients
    access similar information, each with their own
    needs and abilities. Our framework is a medical
    digital library for physicians and patients. We
    describe a summarizer, which generates summaries of
    findings in an input set of clinical studies. When a
    physician is treating a specific patient, he's
    looking for information relevant to the patient's
    history and problems. The summarizer takes the
    user's interests into account and presents only the
    findings pertaining to a user model, as approximated
    by an existing patient record. The same synthesis of
    information can also be of interest to the
    patient. The summarizer predicts which medical terms
    used in a text will be too technical for patients,
    and augments it with appropriate definitions when
    necessary. We adopt a generation-like architecture
    for our summarizer. However, because our input is
    textual and not semantic, new challenges arise. We
    operate over a content representation hybrid between
    full-semantic and extracted phrases. Our content
    organization strategy is dynamic and
    data-driven. This is in contrast to most summarizers
    which use no explicit strategies to order
    information extracted from several input
    documents. The result is more readable, coherent
    output. To generate the actual summary, the
    summarizer makes use of aggregation and phrasal
    generation. The result is a concise and fluent
    summary. One key challenge when it comes to
    adapting a text for a different audience is
    identifying the bottleneck for reader comprehension.
    We analyzed corpora of technical and lay medical
    texts and qualified differences. We identified the
    presence of difficult vocabulary as the major
    obstacle to comprehension for lay readers. We
    designed an unsupervised method to predict which
    terms are incomprehensible for lay readers and
    provide the user with appropriate definitions. Our
    methods are grounded on corpus analyses and
    feasibility studies conducted with physicians and
    consumers of health information. To assess the value
    of our work, we evaluated our summarizer both
    intrinsically and extrinsically. Our task-based
    evaluation conducted with physicians at the ICU
    demonstrates that personalized summaries help
    physicians access relevant information better than
    generic summaries. Evaluation with lay readers shows
    that our method to augment technical medical texts
    improves readers' comprehension significantly.},
}
% Single-author book on summarizing information.
@book{endres-niggemeyer-98a,
  title     = {Summarizing information},
  author    = {Endres-Niggemeyer, Brigitte},
  year      = 1998,
  publisher = {Springer},
}
% Workshop paper on extraction-based multi-document summarization.
% The title is single-braced so styles can apply their own casing.
@inproceedings{goldstein-ANLP-00,
  author    = {Goldstein, Jade and Mittal, Vibhu O. and Carbonell,
    Jaime and Kantrowitz, Mark},
  title     = {Multi-Document Summarization by Sentence
    Extraction},
  booktitle = {Proceedings of the Workshop on Automatic
    Summarization at the 6th Applied Natural Language
    Processing Conference and the 1st Conference of the
    North American Chapter of the Association for
    Computational Linguistics},
  editor    = {Hahn, Udo and Lin, Chin-Yew and Mani, Inderjeet and
    Radev, Dragomir R.},
  address   = {Seattle, WA},
  month     = apr,
  year      = 2000,
  abstract  = {This paper discusses a text extraction approach to
    multi-document summarization that builds on
    single-document summarization methods by using
    additional, available information about the document
    set as a whole and the relationships between the
    documents. Multi-document summarization differs from
    single in that the issues of compression, speed,
    redundancy and passage selection are critical in the
    formation of useful summaries. Our approach
    addresses these issues by using domain-independent
    techniques based mainly on fast, statistical
    processing, a metric for reducing redundancy and
    maximizing diversity in the selected passages, and a
    modular framework to allow easy parameterization for
    different genres, corpora characteristics and user
    requirements.},
}
% SIGIR paper analysing sentence-selection summaries of news articles
% and how to evaluate them. Conference days are kept in eventdate
% (ignored by classic BibTeX styles); month is the standard macro.
@inproceedings{goldstein-SIGIR-99,
  author    = {Goldstein, Jade and Kantrowitz, Mark and Mittal,
    Vibhu and Carbonell, Jaime},
  title     = {Summarizing Text Documents: Sentence Selection and
    Evaluation Metrics},
  booktitle = {Proceedings of the 22nd Annual International ACM
    SIGIR Conference on Research and Development in
    Information Retrieval},
  address   = {Berkeley, California},
  month     = aug,
  year      = 1999,
  eventdate = {1999-08-15/1999-08-19},
  pages     = {121--128},
  url       = {http://citeseer.ist.psu.edu/goldstein99summarizing.html},
  abstract  = {Human-quality text summarization systems are
    difficult to design, and even more difficult to
    evaluate, in part because documents can differ along
    several dimensions, such as length, writing style
    and lexical usage. Nevertheless, certain cues can
    often help suggest the selection of sentences for
    inclusion in a summary. This paper presents our
    analysis of news-article summaries generated by
    sentence selection. Sentences are ranked for
    potential inclusion in the summary using a weighted
    combination of statistical and linguistic features.
    The statistical features were adapted from standard
    IR methods. The potential linguistic ones were
    derived from an analysis of news-wire summaries. To
    evaluate these features we use a normalized version
    of precision-recall curves, with a baseline of
    random sentence selection, as well as analyze the
    properties of such a baseline. We illustrate our
    discussions with empirical results showing the
    importance of corpus-dependent baseline
    summarization standards, compression ratios and
    carefully crafted long queries.},
}
% Book chapter on using abstracts to teach reading to students of
% English as a second language. The url is stored raw for url-aware
% styles; the publisher's city lives in address.
@incollection{graetz-85,
  author    = {Graetz, Naomi},
  title     = {Teaching {EFL} students to extract structural
    information from abstracts},
  booktitle = {Reading for Professional Purposes: Methods and
    Materials in Teaching Languages},
  editor    = {Ulijn, J. M. and Pugh, A. K.},
  publisher = {Acco},
  address   = {Leuven},
  year      = 1985,
  pages     = {123--135},
  url       = {http://www.eric.ed.gov/ERICWebPortal/custom/portlets/recordDetails/detailmini.jsp?_nfpb=true&_&ERICExtSearch_SearchValue_0=ED224327&ERICExtSearch_SearchType_0=no&accno=ED224327},
  abstract  = {The benefits for
    students of English as a second language of reading
    abstracts are considered, and the functions and
    types of abstracts are reviewed. In addition, the
    results of a survey of Ben Gurion University
    (Israel) lecturers regarding their reading habits
    and use of abstracts are briefly addressed. It is
    suggested that when abstracts are reproduced
    together with the article, they can be used in the
    classroom as advanced organizers. For the abstract
    that follows the structure of the article exactly,
    two types of activities may be undertaken: asking
    the student to find and outline the corresponding
    sections in the article, and forcing the student to
    read between the subtitles. An example of how to
    break down the structure of an abstract and relate
    it to the article is presented. Abstracts can also
    be used in isolation as cohesive and coherent texts
    in their own right. For instance, since abstracts
    are short texts, several abstracts on related topics
    can be studied in much less time than it would take
    to read one entire article. In planning the
    curriculum, it is proposed that abstracts can be
    used on all levels. For the lower level class, short
    or indicative types of abstracts can be used. For
    the intermediate level, longer, informative types
    are useful, and for the advanced levels, the
    critical abstract is appropriate. Appended material
    includes sample abstracts, information on the
    organization of the abstract, classifications of
    introductory and concluding lines, a list of
    journals with abstracts, an example of an ideal
    abstract, and results of the faculty attitude
    questionnaire.},
}
% Conference paper on building corpora for summarisation. The accented
% letter is written as a BibTeX special character (brace group starting
% with a control sequence) so sorting and labels treat it as one letter.
% Conference days are kept in eventdate; month is the standard macro.
@inproceedings{hasler-CL-03,
  author    = {Hasler, Laura and Or{\u{a}}san, Constantin and Mitkov,
    Ruslan},
  title     = {Building better corpora for summarisation},
  booktitle = {Proceedings of Corpus Linguistics 2003},
  address   = {Lancaster, UK},
  month     = mar,
  year      = 2003,
  eventdate = {2003-03-28/2003-03-31},
  pages     = {309--319},
  url       = {http://clg.wlv.ac.uk/papers/hasler-CL-03.pdf},
}
% PhD thesis on computer-aided summarisation: a corpus-based
% classification of the operations humans apply when turning extracts
% into abstracts, evaluated with a Centering Theory metric.
@phdthesis{hasler-phd-07,
  author   = {Hasler, Laura},
  title    = {From extracts to abstracts: Human summary production
    operations for computer-aided summarisation},
  school   = {University of Wolverhampton, UK},
  year     = 2007,
  url      = {http://clg.wlv.ac.uk/papers/hasler-thesis.pdf},
  abstract = {This thesis is concerned with the field of
    computer-aided summarisation, which has emerged at
    the confluence of the separate but related fields of
    human and automatic summarisation. Due to the poor
    quality of the readability and coherence of
    automatically produced extracts, computer-aided
    summarisation (CAS) is a viable working option to
    fully automatic summarisation. CAS allows a human
    summariser to post-edit automatically produced
    extracts to improve their readability and
    coherence. In order to best utilise the concept of
    computer-aided summarisation, reliable ways of
    improving the coherence and readability of extracts
    when transforming them into abstracts must be
    established. To achieve this, a corpus-based
    analysis of the operations a human summariser
    applies to extracts to transform them into abstracts
    is presented. The corpus developed here is a corpus
    of pairs of news texts annotated for important
    information (i.e., human-produced extracts) and the
    human-produced abstracts corresponding to these
    extracts. The creation of this corpus simulates the
    computer-aided summarisation process to enable a
    reliable investigation into the operations used. A
    detailed classification of human summary production
    operations is proposed, with examples which
    highlight the common linguistic realisations and
    functions of the operations identified in the
    corpus. The classification is then used as a basis
    for guidelines which can be given to users of
    computer-aided summarisation systems in order to
    ensure that the summaries they produce are of a
    consistently high quality. The human summary
    production operations are applied to extracts using
    the guidelines in order to evaluate them. Evaluation
    is performed using a metric developed for Centering
    Theory, a discourse theory of local coherence and
    salience, which constitutes a new evaluation
    method. This is appropriate because existing methods
    of evaluating summaries are unsuitable. A set of
    both automatic and human-produced extracts and
    their corresponding abstracts are evaluated, and a
    comparison is made with evaluations given by a human
    judge. The evaluation shows that when the operations
    are applied to extracts using the guidelines, there
    is an improvement in the readability and coherence
    of the resulting abstracts.},
}
@phdthesis{hirao-phd-02,
  author = {Hirao, Tsutomu},
  school = {Nara Institute of Science and Technology},
  title = {A Study on Generic and User-focused Automatic
                  Summarization},
  url = {http://cl.aist-nara.ac.jp/thesis/dthesis-hirao.pdf},
  abstract = {Due to the rapid growth of the Internet and the
                  emergence of low-price and large-capacity storage
                  devices, the number of online documents is
                  exploding. This situation makes it difficult to find
                  and gather the information we really
                  need. Therefore, many researchers have been studying
                  technologies to overcome this difficulty. Examples
                  include Automatic Summarization, Information
                  Retrieval (IR), Information Extraction (IE), and
                  Question-Answering (QA). In recent years, Automatic
                  Text Summarization has attracted the attention of a
                  lot of researchers in this field. This technology
                  produces overviews that are easier and faster to
                  browse than the original documents.  This thesis
                  discusses the following three topics in automatic
                  text summarization: 1. High performance "generic"
                  single-document summarization with many features
                  (Chapter 2).  2. "Generic" multi-document
                  summarization by extending the single-document
                  summarization method (Chapter 3).  3. "User-focused"
                  summarization as evidence of answer in
                  Question-Answering Systems (Chapter 4).  Chapter 2
                  proposes a method of "generic" single-document
                  summarization based on Support Vector Machines. It
                  is known that integrating heterogeneous sentence
                  features is effective for summarization. However, we
                  cannot manually find optimal parameter values for
                  these features when many features are
                  available. Therefore, machine learning has attracted
                  attention in order to integrate heterogeneous
                  features effectively.  However, most machine
                  learning methods overfit the training data when many
                  features are given. In order to solve this
                  difficulty, we employ Support Vector Machines, which
                  are robust even when the number of features is
                  large. Moreover, we do not know what the effective
                  features are. To address this problem, we analyze
                  the weights of features and clarify them.  Chapter 3
                  proposes a "generic" multi-document summarization
                  method using Support Vector Machines. Multi-document
                  summarization is almost the same as single-document
                  summarization, except that we need to consider extra
                  features for the former. Therefore, we face the same
                  problem as in single-document summarization: how to
                  handle many relevant features. We expand the
                  single-document summarization method based on Support
                  Vector Machines to multi-document summarization. It
                  is said that a summary from multi-documents has
                  redundancy, i.e., there are redundant
                  sentences. Therefore, we investigate the
                  effectiveness of Maximum Marginal Relevance (MMR)
                  which is one of the generally used methods for
                  minimizing redundancy.  In Chapter 4, we propose a
                  "user-focused" summarization method,
                  Question-Biased Text Summarization (QBTS), which produces
                  evidence of the Question-Answering system's
                  answer. Question-Answering systems output the exact
                  answer to a question not a document. By using QA
                  systems, we can reduce the time taken to select
                  information. However, QA system's outputs, i.e.,
                  answers, are not always correct. Therefore, we
                  propose a summarization method which focuses on not
                  only the question, but also on prospective answers
                  to the question to justify the correctness of the QA
                  system's answer.},
  month = {September},
  year = 2002
}
@incollection{hovy-handbook-03,
  author = {Hovy, Eduard},
  editor = {Mitkov, Ruslan},
  publisher = {Oxford University Press},
  title = {Text summarisation},
  url = {http://www.isi.edu/natural-language/people/hovy/papers/05Handbook-Summ-hovy.pdf},
  booktitle = {The Oxford Handbook of Computational Linguistics},
  year = 2003,
  pages = {583 -- 598}
}
@inproceedings{jing-SIGIR-99,
  author = {Jing, Hongyan and McKeown, Kathleen R.},
  title = {The Decomposition of Human-Written Summary
                  Sentences},
  url = {http://www.cs.columbia.edu/~hjing/papers/decomposition.ps},
  booktitle = {Proceedings of the 22nd Annual International ACM
                  SIGIR Conference on Research and Development in
                  Information Retrieval (SIGIR'99)},
  address = {Berkeley, CA},
  abstract = {We define the problem of decomposing human-written
                  summary sentences and propose a novel Hidden Markov
                  Model solution to the problem.  Human summarizers
                  often rely on cutting and pasting of the full
                  document to generate summaries. Decomposing a
                  human-written summary sentence requires determining:
                  (1) whether it is constructed by cutting and
                  pasting, (2) what components in the sentence come
                  from the original document, and (3) where in the
                  document the components come from.  Solving the
                  decomposition problem can potentially lead to the
                  automatic acquisition of large corpora for
                  summarization. It also sheds light on the generation
                  of summary text by cutting and pasting. The
                  evaluation shows that the proposed decomposition
                  algorithm performs well.},
  month = {August},
  year = 1999,
  pages = {129 -- 136}
}
@article{johnson-95,
  author = {Johnson, Frances},
  title = {Automatic abstracting research},
  url = {http://www.emeraldinsight.com/10.1108/00242539510102574},
  journal = {Library Review},
  number = 8,
  abstract = {The prospect of automatically generating abstracts
                  has attracted researchers for some time, but the
                  promise of superseding the human effort has yet to
                  be realized. Surveys the approaches and techniques
                  developed with the view to showing why this is
                  so. Particular emphasis is placed on the
                  requirements for the production of abstracts, which
                  effectively serve their intended function, to show
                  the ways in which this has hampered research in the
                  past. Suggests that progress of automatic
                  abstracting research may come about via the
                  integration of some of the techniques into
                  computerized information retrieval systems. This
                  will allow researchers to shift the aim from
                  reproducing the conventional benefits of abstracts
                  to accentuating the advantages to users of
                  computerized representation of information in large
                  textual databases.},
  volume = 44,
  year = 1995,
  pages = {28 -- 36}
}
@inproceedings{knight-AAAI-00,
  author = {Knight, Kevin and Marcu, Daniel},
  title = {Statistics-Based Summarization -- Step One: Sentence
                  Compression},
  url = {http://www.isi.edu/~marcu/papers/aaai-stat-sum-00.pdf},
  booktitle = {Proceedings of the 17th National Conference on
                  Artificial Intelligence (AAAI)},
  address = {Austin, Texas, USA},
  abstract = {When humans produce summaries of documents, they do
                  not simply extract sentences and concatenate
                  them. Rather, they create new sentences that are
                  grammatical, that cohere with one another, and that
                  capture the most salient pieces of information in
                  the original document.  Given that large collections
                  of text/abstract pairs are available online, it is
                  now possible to envision algorithms that are trained
                  to mimic this process. In this paper, we focus on
                  sentence compression, a simpler version of this
                  larger challenge. We aim to achieve two goals
                  simultaneously: our compressions should be
                  grammatical, and they should retain the most
                  important pieces of information. These two goals can
                  conflict. We devise both noisy-channel and
                  decision-tree approaches to the problem, and we
                  evaluate results against manual compressions and a
                  simple baseline.},
  month = {July 30 -- August 3},
  year = 2000,
  pages = {703 -- 710}
}
@inproceedings{kupiec-SIGIR-95,
  author = {Kupiec, Julian and Pedersen, Jan and Chen, Francine},
  title = {A trainable document summarizer},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.41.1161},
  booktitle = {Proceedings of the 18th ACM/SIGIR Annual Conference
                  on Research and Development in Information
                  Retrieval},
  address = {Seattle},
  month = {July 09 -- 13},
  year = 1995,
  pages = {68 -- 73}
}
@book{lancaster-98,
  author = {Lancaster, Frederick W.},
  edition = {Second},
  publisher = {Library Association},
  address = {London},
  year = 1998,
  title = {Indexing and abstracting in theory and practice}
}
@inproceedings{lin-ROUGE-03,
  author = {Lin, Chin-Yew and Hovy, Eduard H.},
  title = {Automatic evaluation of summaries using n-gram
                  co-occurrence statistics},
  url = {http://portal.acm.org/citation.cfm?id=1073465},
  booktitle = {Proceedings of 2003 Language Technology Conference
                  (HLT-NAACL 2003)},
  address = {Edmonton, Canada},
  abstract = {Following the recent adoption by the machine
                  translation community of automatic evaluation using
                  the BLEU/NIST scoring process, we conduct an
                  in-depth study of a similar idea for evaluating
                  summaries. The results show that automatic
                  evaluation using unigram co-occurrences between
                  summary pairs correlates surprising well with human
                  evaluations, based on various statistical metrics;
                  while direct application of the BLEU evaluation
                  procedure does not always give good results.},
  month = {May 27 -- June 1},
  year = 2003,
  pages = {71 -- 78}
}
@inproceedings{lin-WAS-04,
  author = {Lin, Chin-Yew},
  title = {{ROUGE}: A Package for Automatic Evaluation of
                  Summaries},
  url = {http://www.aclweb.org/anthology-new/W/W04/W04-1013.pdf},
  booktitle = {Proceedings of the Workshop on Text Summarization
                  Branches Out (WAS 2004)},
  address = {Barcelona, Spain},
  abstract = {ROUGE stands for Recall-Oriented Understudy for
                  Gisting Evaluation.  It includes measures to
                  automatically determine the quality of a summary by
                  comparing it to other (ideal) summaries created by
                  humans.  The measures count the number of
                  overlapping units such as n-gram, word sequences,
                  and word pairs between the computer-generated
                  summary to be evaluated and the ideal summaries
                  created by humans. This paper introduces four
                  different ROUGE measures: ROUGE-N, ROUGE-L, ROUGE-W,
                  and ROUGE-S included in the ROUGE summarization
                  evaluation package and their evaluations. Three of
                  them have been used in the Document Understanding
                  Conference (DUC) 2004, a large-scale summarization
                  evaluation sponsored by NIST.},
  month = {July 25 -- 26},
  year = 2004
}
@article{luhn-IBMJ-58,
  author = {Luhn, H. P.},
  title = {The automatic creation of literature abstracts},
  url = {http://courses.ischool.berkeley.edu/i256/f06/papers/luhn58.pdf},
  journal = {IBM Journal of Research and Development},
  number = 2,
  abstract = {Excerpts of technical papers and magazine articles
                  that serve the purposes of conventional abstracts
                  have been created entirely by automatic means. In
                  the exploratory research described, the complete
                  text of an article in machine-readable form is
                  scanned by an IBM 704 data-processing machine and
                  analyzed in accordance with a standard
                  program. Statistical information derived from word
                  frequency and distribution is used by the machine to
                  compute a relative measure of significance, first
                  for individual words and then for sentences.
                  Sentences scoring highest in significance are
                  extracted and printed out to become the
                  "auto-abstract."},
  volume = 2,
  year = 1958,
  pages = {159 -- 165}
}
@book{mani-01,
  author    = {Mani, Inderjeet},
  title     = {Automatic Summarization},
  series    = {Natural Language Processing},
  publisher = {John Benjamins Publishing Company},
  year      = 2001
}
@book{mani-99,
  editor = {Mani, Inderjeet and Maybury, Mark T.},
  publisher = {MIT Press},
  title = {Advances in Automatic Text Summarization},
  url = {http://mitpress.mit.edu/catalog/item/default.asp?ttype=2&tid=3943},
  abstract = {With the rapid growth of the World Wide Web and
                  electronic information services, information is
                  becoming available on-line at an incredible
                  rate. One result is the oft-decried information
                  overload. No one has time to read everything, yet we
                  often have to make critical decisions based on what
                  we are able to assimilate. The technology of
                  automatic text summarization is becoming
                  indispensable for dealing with this problem. Text
                  summarization is the process of distilling the most
                  important information from a source to produce an
                  abridged version for a particular user or task.
                  Until now there has been no state-of-the-art
                  collection of the most important writings in
                  automatic text summarization. This book presents the
                  key developments in the field in an integrated
                  framework and suggests future research areas. The
                  book is organized into six sections: Classical
                  Approaches, Corpus-Based Approaches, Exploiting
                  Discourse Structure, Knowledge-Rich Approaches,
                  Evaluation Methods, and New Summarization Problem
                  Areas.},
  year = 1999
}
@inproceedings{mani-AAAI-98,
  author = {Mani, Inderjeet and Bloedorn, Eric},
  publisher = {MIT Press},
  title = {Machine learning of generic and user-focused
                  summarization},
  url = {http://www.aaai.org/Papers/AAAI/1998/AAAI98-116.pdf},
  booktitle = {Proceedings of the Fifteenth National Conference on
                  Artificial Intelligence},
  address = {Madison, Wisconsin},
  abstract = {A key problem in text summarization is finding a
                  salience function which determines what information
                  in the source should be included in the
                  summary. This paper describes the use of machine
                  learning on a training corpus of documents and their
                  abstracts to discover salience functions which
                  describe what combination of features is optimal for
                  a given summarization task. The method addresses
                  both "generic" and user-focused summaries.},
  year = 1998,
  pages = {821 -- 826}
}
@article{mani-IR-99,
  author = {Mani, Inderjeet and Bloedorn, Eric},
  title = {Summarizing Similarities and Differences Among
                  Related Documents},
  journal = {Information Retrieval},
  number = {1--2},
  abstract = {In many modern information retrieval applications, a
                  common problem which arises is the existence of
                  multiple documents covering similar information, as
                  in the case of multiple news stories about an event
                  or a sequence of events. A particular challenge for
                  text summarization is to be able to summarize the
                  similarities and differences in information content
                  among these documents. The approach described here
                  exploits the results of recent progress in
                  information extraction to represent salient units of
                  text and their relationships. By exploiting
                  meaningful relations between units based on an
                  analysis of text cohesion and the context in which
                  the comparison is desired, the summarizer can
                  pinpoint similarities and differences, and align
                  text segments. In evaluation experiments, these
                  techniques for exploiting cohesion relations result
                  in summaries which (i) help users more quickly
                  complete a retrieval task (ii) result in improved
                  alignment accuracy over baselines, and (iii) improve
                  identification of topic-relevant similarities and
                  differences.},
  month = {May},
  volume = 1,
  year = 1999,
  pages = {35 -- 67}
}
@techreport{mani-SUMMAC-98,
  author = {Mani, Inderjeet and Firmin, Therese and House, David
                  and Chrzanowski, Michael and Klein, Gary and
                  Hirschman, Lynette and Sundheim, Beth and Obrst, Leo},
  title = {The {TIPSTER SUMMAC} Text Summarization Evaluation:
                  Final Report},
  url = {http://www.itl.nist.gov/iaui/894.02/related_projects/tipster_summac/index.html},
  abstract = {In May 1998, the U.S. government completed the
                  TIPSTER Text Summarization Evaluation (SUMMAC),
                  which was the first large-scale,
                  developer-independent evaluation of automatic text
                  summarization systems. Two main extrinsic evaluation
                  tasks were defined, based on activities typically
                  carried out by information analysts in the
                  U.S. Government. In the adhoc task, the focus was on
                  indicative summaries which were tailored to a
                  particular topic. In the categorization task, the
                  evaluation sought to find out whether a generic
                  summary could effectively present enough information
                  to allow an analyst to quickly and correctly
                  categorize a document. The final, question-answering
                  task involved an intrinsic evaluation where a
                  topic-related summary for a document was evaluated
                  in terms of its "informativeness", namely, the
                  degree to which it contained answers found in the
                  source document to a set of topic-related questions.
                  SUMMAC has established definitively in a large-scale
                  evaluation that automatic text summarization is very
                  effective in relevance assessment tasks. Summaries
                  at relatively low compression rates (17\% for adhoc,
                  10\% for categorization) allowed for relevance
                  assessment almost as accurate as with full-text (5\%
                  degradation in F-score for adhoc and 14\%
                  degradation for categorization, both degradations
                  not being statistically significant), while reducing
                  decision-making time by 40\% (categorization) and
                  50\% (adhoc). In the question-answering task,
                  automatic methods for measuring informativeness of
                  topic-related summaries were introduced; the
                  systems' scores using the automatic methods were
                  found to correlate positively with informativeness
                  scores rendered by human judges. The evaluation
                  methods used in the SUMMAC evaluation are of
                  intrinsic interest to both summarization evaluation
                  as well as evaluation of other "output-related" NLP
                  technologies, where there may be many potentially
                  acceptable outputs, with no automatic way to compare
                  them.},
  number = {MTR 98W0000138},
  year = 1998,
  institution = {The MITRE Corporation}
}
@book{marcu-00,
  author = {Marcu, Daniel},
  publisher = {The MIT Press},
  year = 2000,
  title = {The Theory and Practice of Discourse Parsing and
                  Summarization}
}
% NOTE: the citation key says AAAI, but the booktitle below is SIGIR'99.
% The key is left unchanged so that existing citations keep resolving.
@inproceedings{marcu-AAAI-99,
  author = {Marcu, Daniel},
  title = {The automatic construction of large-scale corpora
                  for summarization research},
  url = {http://portal.acm.org/citation.cfm?id=312624.312668},
  booktitle = {Proceedings of the 22nd Annual International ACM
                  SIGIR Conference on Research and Development in
                  Information Retrieval (SIGIR'99)},
  address = {Berkeley, CA},
  abstract = {Summarization research is notorious for its lack of
                  adequate corpora: today, there exist only a few
                  small collections of texts whose units have been
                  manually annotated for textual importance. Given the
                  cost and tediousness of the annotation process, it
                  is very unlikely that we will ever manually annotate
                  for textual importance sufficiently large corpora of
                  texts. To circumvent this problem, we have developed
                  an algorithm that constructs such corpora
                  automatically. Our algorithm takes as input an
                  $\langle$Abstract, Text$\rangle$ tuple and generates the
                  corresponding Extract, i.e., the set of clauses
                  (sentences) in the Text that were used to write the
                  Abstract. The performance of the algorithm is shown
                  to be close to that of humans by means of an
                  empirical experiment.  The experiment also suggests
                  extraction strategies that could improve the
                  performance of automatic summarization systems.},
  month = {August 15 -- 19},
  year = 1999,
  pages = {137 -- 144}
}
@phdthesis{marcu-phd-97,
  author = {Marcu, Daniel},
  title  = {The Rhetorical Parsing, Summarization and Generation
            of Natural Language Texts},
  school = {Department of Computer Science, University of
            Toronto, Toronto, Canada},
  year   = 1997,
  url    = {http://www.isi.edu/~marcu/papers/phd-thesis.ps.gz}
}
@inproceedings{miike-SIGIR-94,
  author = {Miike, Seiji and Itoh, Etsuo and Ono, Kenji and
                  Sumita, Kazuo},
  publisher = {ACM/Springer},
  title = {A Full-Text Retrieval System with a Dynamic Abstract
                  Generation Function},
  url = {http://portal.acm.org/citation.cfm?id=188550},
  booktitle = {Proceedings of the 17th ACM SIGIR Conference},
  address = {Dublin, Ireland},
  abstract = {We have developed a Japanese full-text retrieval
                  system named BREVIDOC that enables the user to
                  specify an area within a text for abstraction and to
                  control the volume of the abstract
                  interactively. This system analyzes a document
                  structure using linguistic knowledge only and thus
                  is domain-independent. In its text structure
                  analysis, the system determines relations among
                  paragraphs and sentences, based on linguistic clues
                  such as connective, anaphoric expressions, and
                  idiomatic expressions.  The system analyzes and
                  stores the text structure in advance so that it can
                  generate an abstract in real time by selecting
                  sentences according to relative importance of
                  rhetorical relations among the sentences.  The
                  retrieval system works on an engineering
                  workstation.},
  month = {July 3 -- 6},
  year = 1994,
  pages = {152 -- 161}
}
@inproceedings{minel-ISTS-97,
  author = {Minel, Jean-Luc and Nugier, Sylvaine and Piat,
                  Gerald},
  title = {How to appreciate the quality of automatic text
                  summarization?},
  url = {http://www.aclweb.org/anthology-new/W/W97/W97-0705.pdf},
  booktitle = {Proceedings of the ACL'97/EACL'97 Workshop on
                  Intelligent Scalable Text Summarization},
  address = {Madrid, Spain},
  abstract = {For the SERAPHIN project, we set up two assessment
                  protocols in order to be able to more accurately
                  assess the quality of abstracts - the FAN protocol
                  and the MLUCE protocol, for which we provide the
                  results.  The FAN protocol assesses the legibility
                  of an abstract, independently from the source text.
                  The MLUCE protocol is designed to allow users of
                  automatic abstracts to assess their quality. These
                  protocols were applied to a corpus of 27 texts which
                  varied in length from between three and twelve
                  pages. These texts were randomly chosen from EDF
                  archives. They include both scientific and general
                  press articles, extracts from books, and internal
                  EDF notes. The results of the FAN protocol
                  demonstrate the difficulty of using surface
                  linguistic indicators to assess the quality of an
                  abstract, the results of the MLUCE protocol
                  illustrate the importance of user expectations.},
  month = {July 11},
  year = 1997,
  pages = {25 -- 30}
}
@book{moens-00,
  author    = {Moens, Marie-Francine},
  title     = {Automatic Indexing and Abstracting of Document
               Texts},
  publisher = {Kluwer Academic Publishers},
  year      = 2000
}
@article{morris-ISR-92,
  author = {Morris, Andrew H. and Kasper, George M. and Adams,
                  Dennis A.},
  title = {The effects and limitations of automated text
                  condensing on reading comprehension performance},
  journal = {Information Systems Research},
  number = 1,
  volume = 3,
  year = 1992,
  pages = {17 -- 35}
}
@phdthesis{nomoto-phd-04,
  author = {Nomoto, Tadashi},
  school = {Nara Institute of Science and Technology},
  title = {Machine Learning Approaches in Rhetorical Parsing
                  and Open-domain Text Summarization},
  url = {http://cl.aist-nara.ac.jp/thesis/dthesis-nomoto.pdf},
  abstract = {The present thesis primarily concerns the use of
                  machine learning for rhetorical parsing and
                  open-domain text summarization. Chapter 1 sets a
                  general backdrop on text summarization and its
                  subfield, rhetorical parsing, and defines the area
                  of investigation. Chapters 2 through 9 form the core
                  of the thesis, developing each theme in great depth,
                  for which we will give a brief overview
                  below. (Throughout the thesis, we talk about
                  extractive summarization, meaning that we create a
                  summary by putting together bits and pieces,
                  usually, sentences extracted from text.)  In
                  chapters 2 through 5, we motivate and develop a
                  novel approach to rhetorical parsing based on the
                  decision tree (DT) learning, which one could adapt
                  for any genre and language given a training corpus.
                  (Unless stated otherwise, DT here and below means
                  Quinlan's C4.5 with default settings.) An important
                  goal of rhetorical parsing is to recover rhetorical
                  structure of text for potential use with text
                  summarization. Performance of our approach is
                  evaluated using a hand-annotated corpus of Japanese
                  newspaper articles. Also some problems with
                  annotating with rhetorical information such as the
                  variability of human judgments on labeling are noted
                  and discussed.  In addition some refinements are
                  made on the DT learning itself by appeal to the
                  minimum description length principle (MDL) and
                  active learning. Evaluation is done using the same
                  data as above. We also look into how a DT harnessed
                  with MDL (DT/MDL), compares in performance with
                  AdaBoosted DT.  Due to poor results with the
                  linguistically motivated paradigm that previous
                  chapters represent, we turn an eye on non-linguistic
                  approaches to summarization. Chapter 6 explores an
                  unsupervised paradigm for text summarization. We
                  develop there what we call the diversity based
                  summarization or DBS, which consists in the K-means
                  clustering (again extended with MDL) and a simple
                  sentence ranking scheme. A new evaluative scheme for
                  summarization (which we call the information-centric
                  approach to evaluation of summaries, or ICE) is also
                  proposed with an eye to providing an objective
                  assessment of the utility of machine generated
                  summaries. Evaluation is conducted using a publicly
                  available corpus known as BMIR-J2.  Then we proceed
                  to the issue of modeling human created summaries in
                  the DBS paradigm. We compare performance of DBS and
                  DT- (and DT/MDL-) based summarizers trained on a
                  human-annotated corpus. Curiously enough, it is
                  found that DBS closely rivals and sometimes
                  outperforms DT- and DT/MDL-based summarizers --
                  which we collectively call `DT(/MDL)' here -- when
                  tested on those annotations which judges tend to
                  disagree on, but falls behind DT(/MDL) on
                  annotations for which there is a strong agreement
                  among judges. The result suggests that there are
                  some useful, i.e., DT-learnable, patterns in
                  annotations for which people have a more or less
                  same idea about what they should be like.  While
                  DT(/MDL) is apparently able to exploit patterns to
                  its advantage, DBS, being unsupervised, is not able
                  to perform as well as when it is run on annotations
                  with varying judgments. Which however points to an
                  integration of DT(/MDL) with DBS as a possible
                  alternative to DBS as the combine should then be
                  able to take into account the regularity as well as
                  variability of human summaries, an issue that
                  engages us in subsequent chapters, where we consider
                  other variations of DT. We argue that taking into
                  account both properties indeed leads to a better
                  performing summarizer.  Finally, we look at curious
                  regularities in the way people vote for summary
                  sentences when asked to pick up those they consider
                  important or summary-worthy. Texts from a news wire
                  domain typically show that initially occurring
                  sentences are popularly voted or preferred for
                  summary sentences while those occurring later in
                  text decidedly get less popular. Texts from a column
                  domain, on the other hand, exhibit a somewhat
                  different pattern, showing that sentences occurring
                  towards the end are as much favored by people as
                  those occurring text-initially.  We argue that the
                  distribution of votes for summary sentences, which
                  we call `DOV,' has some shape specific to a domain,
                  and propose a particular approach that directly
                  exploits DOVs by way of Bayesian modeling. We show
                  that the Bayesian model provides a significant
                  leverage over approaches based on pattern
                  classifiers such as C4.5, Adtree, Kstar, Naive
                  Bayes, etc.},
  month = {December},
  year = 2004
}
@inproceedings{oka-NAACL-00,
  author = {Oka, Mamiko and Ueda, Yoshihiro},
  title = {Evaluation of Phrase-Representation Summarization
                  Based on Information Retrieval Task},
  url = {http://www.aclweb.org/anthology/W/W00/W00-0407.pdf},
  booktitle = {NAACL-ANLP 2000 Workshop on Automatic Summarization},
  address = {Seattle, Washington},
  abstract = {We have developed an improved task-based evaluation
                  method of summarization, the accuracy of which is
                  increased by specifying the details of the task
                  including background stories, and by assigning ten
                  subjects per summary sample. The method also serves
                  precision/recall pairs for a variety of situations
                  by introducing multiple levels of relevance
                  assessment. The method is applied to prove
                  phrase-represented summary is most effective to
                  select relevant documents from information retrieval
                  results.},
  month = apr # "~30",
  year = 2000,
  pages = {59--68}
}
@inproceedings{okumura-TS-03,
  author = {Okumura, Manabu and Fukusima, Takahiro and Nanba,
                  Hidetsugu},
  title = {{Text Summarization Challenge 2}: Text Summarization
                  Evaluation at {NTCIR Workshop 3}},
  url = {http://acl.ldc.upenn.edu/W/W03/W03-0507.pdf},
  booktitle = {Proceedings of the HLT-NAACL 2003 Workshop on Text
                  Summarization},
  address = {Edmonton, Alberta, Canada},
  abstract = {We describe the outline of Text Summarization
                  Challenge 2 (TSC2 hereafter), a sequel text
                  summarization evaluation conducted as one of the
                  tasks at the NTCIR Workshop 3. First, we describe
                  briefly the previous evaluation, Text Summarization
                  Challenge (TSC1) as introduction to TSC2. Then we
                  explain TSC2 including the participants, the two
                  tasks in TSC2, data used, evaluation methods for
                  each task, and brief report on the results.},
  month = may # "~31--" # jun # "~1",
  year = 2003,
  pages = {49--56}
}
@phdthesis{orasan-PHD-06,
  author = {Or{\u{a}}san, Constantin},
  school = {University of Wolverhampton},
  title = {Comparative evaluation of modular automatic
                  summarisation systems using {CAST}},
  url = {http://www.wlv.ac.uk/~in6093/papers/PhD/Thesis.pdf},
  abstract = {The information overload faced by today's society
                  poses great challenges to researchers who want to
                  find a relevant piece of information.  Automatic
                  summarisation is a field of computational
                  linguistics which can help humans to deal with this
                  information overload by automatically extracting the
                  gist of documents.  This thesis attempts to gain
                  insights into the automatic summarisation field from
                  several different angles. First, it performs
                  qualitative, quantitative and comparative
                  evaluations of different automatic summarisation
                  methods. These summarisation methods are built
                  around a term-based summariser which is then
                  augmented with additional linguistic information
                  which includes lexical, semantic and discourse
                  information. On the basis of these evaluations, it
                  was noticed that the choice of modules which provide
                  low-level linguistic information (e.g. morphological
                  processors) does not influence the results
                  significantly, but higher level linguistic
                  information, such as anaphora resolution and shallow
                  information about discourse structure, leads to
                  significant improvements of the summaries.  In order
                  to have a comprehensive view of how good summaries
                  produced by a given method are, the evaluation
                  performed in this thesis measures both the
                  informativeness of the summaries produced and the
                  quality of their discourse structure. Moreover, a
                  method which determines the upper limit for
                  informativeness is proposed to demonstrate the
                  limits of extraction techniques. Comparison between
                  the informativeness and the quality of discourse
                  reveals no correlation between them.  A third
                  direction pursued in this research is to replace
                  conventional iterative extraction methods, which
                  extract one sentence at a time without considering
                  the rest of the sentences in the summary, with more
                  holistic ones, where the decision to extract a
                  sentence is determined not only by the content of a
                  sentence, but also by the rest of the sentences
                  extracted. To this end, a genetic algorithm which
                  encodes the whole summary is implemented and is
                  shown to produce better summaries than its iterative
                  equivalent.},
  year = 2006
}
@inproceedings{orasan-RANLP-07,
  author = {Or{\u{a}}san, Constantin and Hasler, Laura},
  title = {Computer-Aided Summarisation: how much does it
                  really help?},
  url = {http://clg.wlv.ac.uk/papers/orasan-hasler-RANLP-07.pdf},
  booktitle = {Proceedings of Recent Advances in Natural Language
                  Processing (RANLP 2007)},
  address = {Borovets, Bulgaria},
  abstract = {Computer-aided summarisation is a technology
                  developed as a complement to automatic
                  summarisation, which produces high quality summaries
                  with less effort. To achieve this, a user-friendly
                  environment which incorporates several well-known
                  summarisation methods has been developed.  This
                  paper presents the main features of the
                  computer-aided summarisation environment and
                  evaluates the usefulness of the developed
                  tool. Experiments showed that it is possible to
                  reduce the time necessary to produce the summary by
                  about 20\% without any degradation in the summary's
                  quality.},
  month = sep # "~27--29",
  year = 2007,
  pages = {437--441}
}
@inproceedings{osborne-ACL-02,
  author = {Osborne, Miles},
  title = {Using maximum entropy for sentence extraction},
  url = {http://portal.acm.org/citation.cfm?id=1118163},
  booktitle = {Proceedings of ACL 2002 Workshop on Automatic
                  Summarization},
  address = {Philadelphia, Pennsylvania},
  abstract = {A maximum entropy classifier can be used to extract
                  sentences from documents. Experiments using
                  technical documents show that such a classifier
                  tends to treat features in a categorical
                  manner. This results in performance that is worse
                  than when extracting sentences using a naive Bayes
                  classifier. Addition of an optimised prior to the
                  maximum entropy classifier improves performance over
                  and above that of naive Bayes (even when naive Bayes
                  is also extended with a similar prior). Further
                  experiments show that, should we have at our
                  disposal extremely informative features, then
                  maximum entropy is able to yield excellent
                  results. Naive Bayes, in contrast, cannot exploit
                  these features and so fundamentally limits sentence
                  extraction performance.},
  month = jul,
  year = 2002,
  pages = {1--8}
}
@incollection{paice-81,
  author = {Paice, Chris D.},
  editor = {Oddy, R. N. and van Rijsbergen, C. J. and Williams,
                  P. W.},
  publisher = {Butterworths},
  title = {The automatic generation of literature abstracts: an
                  approach based on the identification of
                  self-indicating phrases},
  url = {http://portal.acm.org/citation.cfm?id=636680},
  booktitle = {Information Retrieval Research},
  address = {London},
  year = 1981,
  pages = {172--191}
}
@inproceedings{radev-LREC-04,
  author = {Radev, Dragomir and Otterbacher, Jahna and Zhang,
                  Zhu},
  title = {{CSTBank}: A Corpus for the Study of Cross-document
                  Structural Relationship},
  url = {http://clair.si.umich.edu/~radev/papers/lrec-cst04.pdf},
  booktitle = {Proceedings of Language Resources and Evaluation
                  Conference (LREC 2004)},
  address = {Lisbon, Portugal},
  abstract = {Clusters of multiple news stories related to the
                  same topic exhibit a number of interesting
                  properties. For example, when documents have been
                  published at various points in time or by different
                  authors or news agencies, one finds many instances
                  of paraphrasing, information overlap and even
                  contradiction. The current paper presents the
                  Cross-document Structure Theory (CST) Bank, a
                  collection of multi-document clusters in which pairs
                  of sentences from different documents have been
                  annotated for cross-document structure theory
                  relationships. We will describe how we built the
                  corpus, including our method for reducing the number
                  of sentence pairs to be annotated by our hired
                  judges, using lexical similarity measures. Finally,
                  we will describe how CST and the CST Bank can be
                  applied to different research areas such as
                  multi-document summarization.},
  year = 2004
}
@inproceedings{radev-NAACL-00,
  author = {Radev, Dragomir R. and Jing, Hongyan and
                  Budzikowska, Malgorzata},
  title = {Centroid-based summarization of multiple documents:
                  sentence extraction, utility-based evaluation and
                  user studies},
  url = {http://aclweb.org/anthology-new/W/W00/W00-0403.pdf},
  booktitle = {NAACL-ANLP 2000 Workshop on Automatic Summarization},
  address = {Seattle, Washington},
  abstract = {We present a multi-document summarizer, called MEAD,
                  which generates summaries using cluster centroids
                  produced by a topic detection and tracking
                  system. We also describe two new techniques, based
                  on sentence utility and subsumption, which we have
                  applied to the evaluation of both single and
                  multiple document summaries. Finally, we describe
                  two user studies that test our models of
                  multi-document summarization.},
  month = apr # "~30",
  year = 2000,
  pages = {21--29}
}
@inproceedings{saggion-NAACL-00,
  author = {Saggion, Horacio and Lapalme, Guy},
  title = {Concept Identification and Presentation in the
                  Context of Technical Text Summarization},
  url = {http://aclweb.org/anthology-new/W/W00/W00-0401.pdf},
  booktitle = {NAACL-ANLP 2000 Workshop on Automatic Summarization},
  address = {Seattle, Washington},
  abstract = {We describe a method of text summarization that
                  produces indicative-informative abstracts for
                  technical papers. The abstracts are generated by a
                  process of conceptual identification, topic
                  extraction and re-generation.  We have carried out
                  an evaluation to assess indicativeness and text
                  acceptability relying on human judgment. The results
                  so far indicate good performance in both tasks when
                  compared with other summarization technologies.},
  month = apr # "~30",
  year = 2000,
  pages = {1--10}
}
@article{salton-IPM-97,
  author = {Salton, Gerard and Singhal, Amit and Mitra, Mandar
                  and Buckley, Chris},
  title = {Automatic text structuring and summarization},
  url = {http://dx.doi.org/10.1016/S0306-4573(96)00062-3},
  doi = {10.1016/S0306-4573(96)00062-3},
  journal = {Information Processing and Management},
  number = 3,
  abstract = {In recent years, information retrieval techniques
                  have been used for automatic generation of semantic
                  hypertext links. This study applies the ideas from
                  the automatic link generation research to attack
                  another important problem in text
                  processing---automatic text summarization.  An
                  automatic ``general purpose'' text summarization
                  tool would be of immense
                  utility in this age of information overload. Using
                  the techniques used (by most automatic hypertext
                  link generation algorithms) for inter-document link
                  generation, we generate intra-document links between
                  passages of a document. Based on the intra-document
                  linkage pattern of a text, we characterize the
                  structure of the text. We apply the knowledge of
                  text structure to do automatic text summarization by
                  passage extraction. We evaluate a set of fifty
                  summaries generated using our techniques by
                  comparing them to paragraph extracts constructed by
                  humans. The automatic summarization methods perform
                  well, especially in view of the fact that the
                  summaries generated by two humans for the same
                  article are surprisingly dissimilar.},
  volume = 33,
  year = 1997,
  pages = {193--207}
}
@article{silber-CL-02,
  author = {Silber, H. Gregory and McCoy, Kathleen F.},
  title = {Efficiently Computed Lexical Chains As an
                  Intermediate Representation for Automatic Text
                  Summarization},
  url = {http://www.aclweb.org/anthology/J02-4004.pdf},
  journal = {Computational Linguistics},
  number = 4,
  abstract = {While automatic text summarization is an area that
                  has received a great deal of attention in recent
                  research, the problem of efficiency in this task has
                  not been frequently addressed. When the size and
                  quantity of documents available on the Internet and
                  from other sources are considered, the need for a
                  highly efficient tool that produces usable summaries
                  is clear. We present a linear-time algorithm for
                  lexical chain computation. The algorithm makes
                  lexical chains a computationally feasible candidate
                  as an intermediate representation for automatic text
                  summarization. A method for evaluating lexical
                  chains as an intermediate step in summarization is
                  also presented and carried out. Such an evaluation
                  was heretofore not possible because of the
                  computational complexity of previous lexical chains
                  algorithms.},
  volume = 28,
  year = 2002,
  pages = {487--496}
}
@inproceedings{silber-IUI-00,
  author = {Silber, H. Gregory and McCoy, Kathleen F.},
  title = {Efficient text summarization using lexical chains},
  url = {http://web.media.mit.edu/~lieber/IUI/Silber/Silber.pdf},
  booktitle = {Proceedings of the 5th International Conference on
                  Intelligent User Interfaces},
  address = {New Orleans, Louisiana, United States},
  abstract = {The rapid growth of the Internet has resulted in
                  enormous amounts of information that has become more
                  difficult to access efficiently.  Internet users
                  require tools to help manage this vast quantity of
                  information. The primary goal of this research is to
                  create an efficient and effective tool that is able
                  to summarize large documents quickly.  This research
                  presents a linear time algorithm for calculating
                  lexical chains which is a method of capturing the
                  ``aboutness'' of a document.  This method is compared
                  to previous, less efficient methods of lexical chain
                  extraction.  We also provide alternative methods for
                  extracting and scoring lexical chains. We show that
                  our method provides similar results to previous
                  research, but is substantially more efficient. This
                  efficiency is necessary in Internet search
                  applications where many large documents may need to
                  be summarized at once, and where the response time
                  to the end user is extremely important.},
  year = 2000,
  pages = {252--255}
}
@incollection{sparck-jones-99,
  author = {{Sparck Jones}, Karen},
  editor = {Mani, Inderjeet and Maybury, Mark T.},
  chapter = 1,
  publisher = {The MIT Press},
  title = {Automatic summarizing: factors and directions},
  booktitle = {Advances in automatic text summarization},
  abstract = {This position paper suggests that progress with
                  automatic summarising demands a better research
                  methodology and a carefully focussed research
                  strategy. In order to develop effective procedures it
                  is necessary to identify and respond to the context
                  factors, i.e. input, purpose, and output factors,
                  that bear on summarising and its evaluation.  The
                  paper analyses and illustrates these factors and
                  their implications for evaluation. It then argues
                  that this analysis, together with the state of the
                  art and the intrinsic difficulty of summarising, imply
                  a nearer-term strategy concentrating on shallow, but
                  not surface, text analysis and on indicative
                  summarising. This is illustrated with current work,
                  from which a potentially productive research
                  programme can be developed.},
  year = 1999,
  pages = {1--12}
}
@article{sparck-jones-IPM-07,
  author = {{Sparck Jones}, Karen},
  title = {Automatic summarising: The state of the art},
  url = {http://dx.doi.org/10.1016/j.ipm.2007.03.009},
  doi = {10.1016/j.ipm.2007.03.009},
  journal = {Information Processing and Management},
  number = 6,
  abstract = {This paper reviews research on automatic summarising
                  in the last decade.  This work has grown, stimulated
                  by technology and by evaluation programmes.  The
                  paper uses several frameworks to organise the
                  review, for summarising itself, for the factors
                  affecting summarising, for systems, and for
                  evaluation.  The review examines the evaluation
                  strategies applied to summarising, the issues they
                  raise, and the major programmes. It considers the
                  input, purpose and output factors investigated in
                  recent summarising research, and discusses the
                  classes of strategy, extractive and non-extractive,
                  that have been explored, illustrating the range of
                  systems built.  The conclusions drawn are that
                  automatic summarisation has made valuable progress,
                  with useful applications, better evaluation, and
                  more task understanding. But summarising systems are
                  still poorly motivated in relation to the factors
                  affecting them, and evaluation needs taking much
                  further to engage with the purposes summaries are
                  intended to serve and the contexts in which they are
                  used.},
  volume = 43,
  year = 2007,
  pages = {1449--1481}
}
@article{teufel-CL-02,
  author = {Teufel, Simone and Moens, Marc},
  title = {Summarizing Scientific Articles: Experiments with
                  Relevance and Rhetorical Status},
  url = {http://www.aclweb.org/anthology/J/J02/J02-4002.pdf},
  journal = {Computational Linguistics},
  number = 4,
  abstract = {In this article we propose a strategy for the
                  summarization of scientific articles that
                  concentrates on the rhetorical status of statements
                  in an article: Material for summaries is selected in
                  such a way that summaries can highlight the new
                  contribution of the source article and situate it
                  with respect to earlier work.  We provide a gold
                  standard for summaries of this kind consisting of a
                  substantial corpus of conference articles in
                  computational linguistics annotated with human
                  judgments of the rhetorical status and relevance of
                  each sentence in the articles. We present several
                  experiments measuring our judges' agreement on these
                  annotations.  We also present an algorithm that, on
                  the basis of the annotated training material,
                  selects content from unseen articles and classifies
                  it into a fixed set of seven rhetorical
                  categories. The output of this extraction and
                  classification system can be viewed as a
                  single-document},
  volume = 28,
  year = 2002,
  pages = {409--445}
}
@inproceedings{teufel-ISTS-97,
  author = {Teufel, Simone and Moens, Marc},
  title = {Sentence extraction as a classification task},
  url = {http://www.aclweb.org/anthology/W/W97/W97-0710.pdf},
  booktitle = {Proceedings of the ACL'97/EACL'97 Workshop on
                  Intelligent Scalable Text Summarization},
  address = {Madrid, Spain},
  abstract = {A useful first step in document summarisation is the
                  selection of a small number of `meaningful'
                  sentences from a larger text. Kupiec et al. (1995)
                  describe this as a classification task: on the basis
                  of a corpus of technical papers with summaries
                  written by professional abstractors, their system
                  identifies those sentences in the text which also
                  occur in the summary, and then acquires a model of
                  the `abstract-worthiness' of a sentence as a
                  combination of a limited number of properties of
                  that sentence. We report on a replication of this
                  experiment with different data: summaries for our
                  documents were not written by professional
                  abstractors, but by the authors themselves. This
                  produced fewer alignable sentences to train on.  We
                  use alternative `meaningful' sentences (selected by
                  a human judge) as training and evaluation material,
                  because this has advantages for the subsequent
                  automatic generation of more flexible abstracts.  We
                  quantitatively compare the two different strategies
                  for training and evaluation (via alignment vs human
                  judgement), we also discuss qualitative
                  differences and consequences for the generation of
                  abstracts.},
  month = jul # "~11",
  year = 1997,
  pages = {58--59}
}
@phdthesis{teufel-phd-99,
  author = {Teufel, Simone},
  school = {University of Edinburgh},
  title = {{Argumentative Zoning}: Information Extraction from
                  Scientific Text},
  url = {http://www.cl.cam.ac.uk/users/sht25/az.html},
  abstract = {We present a new type of analysis for scientific
                  text which we call Argumentative Zoning.  We
                  demonstrate that this type of text analysis can be
                  used for generating user-tailored and task-tailored
                  summaries and for performing more informative
                  citation analyses.  We also demonstrate that our
                  type of analysis can be applied to unrestricted
                  text, both automatically and by humans. The corpus
                  we use for the analysis (80 conference papers in
                  computational linguistics) is a difficult test bed;
                  it shows great variation with respect to subdomain,
                  writing style, register and linguistic
                  expression. We present reliability studies which we
                  performed on this corpus and for which we use two
                  unrelated trained annotators.  The definition of our
                  seven categories (argumentative zones) is not
                  specific to the domain, only to the text type; it is
                  based on the typical argumentation to be found in
                  scientific articles. It reflects the attribution of
                  intellectual ownership in scientific articles,
                  expressions of authors' stance towards other work,
                  and typical statements about problem-solving
                  processes.  On the basis of sentential features, we
                  use two statistical models (a Naive Bayesian model
                  and an ngram model operating over sentences) to
                  estimate a sentence's argumentative status, taking
                  the hand-annotated corpus as training material. An
                  alternative, symbolic system uses the features in a
                  rule-based way.  The general working hypothesis of
                  this thesis is that empirical discourse studies can
                  contribute to practical document management
                  problems: the analysis of a significant amount of
                  naturally occurring text is essential for discourse
                  linguistic theories, and the application of a robust
                  discourse and argumentation analysis can make text
                  understanding techniques for practical document
                  management more robust.},
  year = 1999
}
@phdthesis{tucker-phd-99,
  author = {Tucker, Richard},
  school = {University of Cambridge},
  title = {Automatic summarising and the {CLASP} system},
  url = {http://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-484.html},
  abstract = {This dissertation discusses summarisers and
                  summarising in general, and presents CLASP, a new
                  summarising system that uses a shallow semantic
                  representation of the source text called a
                  ``predication cohesion graph''.  Nodes in the graph
                  are ``simple predications'' corresponding to events,
                  states and entities mentioned in the text; edges
                  indicate related or similar nodes. Summary content
                  is chosen by selecting some of these predications
                  according to criteria of ``importance'',
                  ``representativeness'' and ``cohesiveness''. These
                  criteria are expressed as functions on the nodes of
                  a weighted graph. Summary text is produced either by
                  extracting whole sentences from the source text, or
                  by generating short, indicative ``summary phrases''
                  from the selected predications.  CLASP uses
                  linguistic processing but no domain knowledge, and
                  therefore does not restrict the subject matter of
                  the source text. It is intended to deal robustly
                  with complex texts that it cannot analyse completely
                  accurately or in full. Experiments in summarising
                  stories from the Wall Street Journal suggest there
                  may be a benefit in identifying important material
                  in a semantic representation rather than a surface
                  one, but that, despite the robustness of the source
                  representation, inaccuracies in CLASP's linguistic
                  analysis can dramatically affect the readability of
                  its summaries. I discuss ways in which this and
                  other problems might be overcome.},
  year = 1999
}

This file was generated by bibtex2html 1.94.


0 Responses

Stay in touch with the conversation, subscribe to the RSS feed for comments on this post.



Some HTML is OK

or, reply to this post via trackback.



Easy AdSense by Unreal
Creative Commons License
This work is licensed under a Creative Commons Licence.