pubs_journals.bib

% Journal article on fault detection and explanation over sensor streams (2017).
% NOTE(review): the url is an Elsevier share link, not a stable identifier;
% add a doi field once the DOI has been confirmed.
@article{manco17,
  author = {Giuseppe Manco and Ettore Ritacco and Pasquale Rullo
                  and Lorenzo Gallucci and Will Astill and Dianne
                  Kimber and Marco Antonelli},
  title = {Fault detection and explanation through big data
                  analysis on sensor streams},
  journal = {Expert Systems with Applications},
  year = 2017,
  url = {http://communications.elsevier.com/r/?id=h454a7a34,185d8b0b,185de8d2&p1=authors.elsevier.com/a/1VDsN3PiGT3sWA},
  volume = 87,
  pages = {141--156},
  keywords = {Fault detection; Anomaly detection; Outlier
                  explanation; Big data; Sensor data},
  abstract = {Fault prediction is an important topic for the
                  industry as, by providing effective methods for
                  predictive maintenance, allows companies to perform
                  important time and cost savings. In this paper we
                  describe an application developed to predict and
                  explain door failures on metro trains. To this end,
                  the aim was twofold: first, devising prediction
                  techniques capable of early detecting door failures
                  from diagnostic data; second, describing failures in
                  terms of properties distinguishing them from normal
                  behavior. Data pre-processing was a complex task
                  aimed at overcoming a number of issues with the
                  dataset, like size, sparsity, bias, burst effect and
                  trust. Since failure premonitory signals did not
                  share common patterns, but were only characterized
                  as non-normal device signals, fault prediction was
                  performed by using outlier detection. Fault
                  explanation was finally achieved by exhibiting
                  device features showing abnormal values. An
                  experimental evaluation was performed to assess the
                  quality of the proposed approach. Results show that
                  high-degree outliers are effective indicators of
                  incipient failures. Also, explanation in terms of
                  abnormal feature values (responsible for
                  outlierness) seems to be quite expressive. There are
                  some aspects in the proposed approach that deserve
                  particular attention. We introduce a general
                  framework for the failure detection problem based on
                  an abstract model of diagnostic data, along with a
                  formal problem statement. They both provide the
                  basis for the definition of an effective data
                  pre-processing technique where the behavior of a
                  device, in a given time frame, is summarized through
                  a number of suitable statistics. This approach
                  strongly mitigates the issues related to data
                  errors/noise, thus enabling to perform an effective
                  outlier detection. All this, in our view, provides
                  the grounds of a general methodology for advanced
                  prognostic systems.}
}

% Journal article on outlying property detection with numerical attributes (2017).
@article{AngiulliFMP17,
  author = {Fabrizio Angiulli and Fabio Fassetti and Giuseppe
                  Manco and Luigi Palopoli},
  title = {Outlying property detection with numerical
                  attributes},
  journal = {Data Mining and Knowledge Discovery},
  volume = {31},
  number = {1},
  pages = {134--163},
  year = {2017},
  keywords = {Outlier detection; Outlying properties; Kernel
                  density estimation; Clustering},
  abstract = {The outlying property detection problem (OPDP) is
                  the problem of discovering the properties
                  distinguishing a given object, known in advance to
                  be an outlier in a database, from the other database
                  objects. This problem has been recently analyzed
                  focusing on categorical attributes only. However,
                  numerical attributes are very relevant and widely
                  used in databases. Therefore, in this paper, we
                  analyze the OPDP within a context where also
                  numerical attributes are taken into account, which
                  represents a relevant case left open in the
                  literature. As major contributions, we present an
                  efficient parameter-free algorithm to compute the
                  measure of object exceptionality we introduce, and
                  propose a unified framework for mining exceptional
                  properties in the presence of both categorical and
                  numerical attributes.},
  url = {https://www.dropbox.com/s/mfy4vuzi9u7ltt6/dami_final.pdf?dl=0},
  doi = {10.1007/s10618-016-0458-x}
}

% Journal article on influence-based, network-oblivious community detection.
% Only one url field is kept (the preprint link, which BibTeX already used as
% the first occurrence); the publisher landing page is reachable through doi.
% NOTE(review): issue_date says 2017 while year says 2016 -- confirm which
% year should be cited.
@article{Barbieri:2016,
  author = {Barbieri, Nicola and Bonchi, Francesco and Manco,
                  Giuseppe},
  title = {Efficient Methods for Influence-Based
                  Network-Oblivious Community Detection},
  journal = {ACM Transactions on Intelligent Systems and Technology},
  issue_date = {2017},
  volume = {8},
  number = {2},
  url = {https://www.dropbox.com/s/x730nf3ljtxj534/TIST-cwn-final_uploaded.pdf?dl=0},
  year = {2016},
  issn = {2157-6904},
  pages = {32:1--32:31},
  articleno = {32},
  numpages = {31},
  doi = {10.1145/2979682},
  acmid = {2979682},
  publisher = {ACM},
  address = {New York, NY, USA},
  keywords = {Social influence, information diffusion,
                  network-oblivious community detection, social
                  network analysis},
  abstract = {We study the problem of detecting social communities
                  when the social graph is not available but instead
                  we have access to a log of user activity, that is, a
                  dataset of tuples (u, i, t) recording the fact that
                  user u ``adopted'' item i at time t. We propose a
                  stochastic framework that assumes that the adoption
                  of items is governed by an underlying diffusion
                  process over the unobserved social network and that
                  such a diffusion model is based on community-level
                  influence. That is, we aim at modeling communities
                  through the lenses of social contagion. By fitting
                  the model parameters to the user activity log, we
                  learn the community membership and the level of
                  influence of each user in each community. The
                  general framework is instantiated with two different
                  diffusion models, one with discrete time and one
                  with continuous time, and we show that the
                  computational complexity of both approaches is
                  linear in the number of users and in the size of the
                  propagation log. Experiments on synthetic data with
                  planted community structure show that our methods
                  outperform non-trivial baselines. The effectiveness
                  of the proposed techniques is further validated on
                  real-world data, on which our methods are able to
                  detect high-quality communities.}
}

% Journal article describing the Rialto Knowledge Discovery platform (2016).
% Only one url field is kept (the preprint link, which BibTeX already used as
% the first occurrence); the publisher landing page is reachable through doi.
% NOTE(review): number = {C} looks like a digital-library export artifact --
% confirm whether this volume has an issue number.
@article{Manco:2016,
  author = {Manco, Giuseppe and Rullo, Pasquale and Gallucci,
                  Lorenzo and Paturzo, Mirko},
  title = {{Rialto}: A {Knowledge Discovery} suite for data
                  analysis},
  journal = {Expert Systems with Applications},
  issue_date = {October 2016},
  volume = {59},
  number = {C},
  url = {https://www.dropbox.com/s/h52bbs3qpldazyh/ESWA_10642_final.pdf?dl=0},
  month = oct,
  year = {2016},
  issn = {0957-4174},
  pages = {145--164},
  numpages = {20},
  doi = {10.1016/j.eswa.2016.04.022},
  acmid = {2950691},
  publisher = {Pergamon Press, Inc.},
  address = {Tarrytown, NY, USA},
  keywords = {Business analytics platforms, Data mining, Knowledge
                  Discovery process},
  abstract = {A Knowledge Discovery (KD) process is a complex
                  inter-disciplinary task, where different types of
                  techniques coexist and cooperate for the purpose of
                  extracting useful knowledge from large amounts of
                  data. So, it is desirable having a unifying
                  environment, built on a formal basis, where to
                  design and perform the overall process. In this
                  paper we propose a general framework which
                  formalizes a KD process as an algebraic expression,
                  that is, as a composition of operators representing
                  elementary operations on two worlds: the data and
                  the model worlds. Then, we describe a KD platform,
                  named Rialto, based on such a framework. In
                  particular, we provide the design principles of the
                  underlying architecture, highlight the basic
                  features, and provide a number of experimental
                  results aimed at assessing the effectiveness of the
                  design choices.}
}

% Journal article on big data adoption by small and medium enterprises (2016).
% Accented characters use LaTeX escapes so the entry stays 7-bit clean under
% classic BibTeX.
@article{QRE:QRE2008,
  author = {Coleman, Shirley and G{\"o}b, Rainer and Manco, Giuseppe
                  and Pievatolo, Antonio and Tort-Martorell, Xavier
                  and Reis, Marco Seabra},
  title = {How Can {SMEs} Benefit from Big Data? Challenges and a
                  Path Forward},
  journal = {Quality and Reliability Engineering International},
  volume = {32},
  number = {6},
  issn = {1099-1638},
  url = {http://www.enbis.org/dl/9313_0081943714.pdf/as/Coleman_et_al-How%20Can%20SMEs%20Benefit%20from%20Big%20Data.pdf?_ts=519&_ts=519},
  doi = {10.1002/qre.2008},
  pages = {2151--2164},
  keywords = {predictive analytics, maturity model, data science,
                  skills shortage},
  year = {2016},
  note = {QRE-15-0533.R1},
  abstract = {Big data is big news, and large companies in all
                  sectors are making significant advances in their
                  customer relations, product selection and
                  development and consequent profitability through
                  using this valuable commodity. Small and medium
                  enterprises (SMEs) have proved themselves to be slow
                  adopters of the new technology of big data analytics
                  and are in danger of being left behind. In Europe,
                  SMEs are a vital part of the economy, and the
                  challenges they encounter need to be addressed as a
                  matter of urgency. This paper identifies barriers to
                  SME uptake of big data analytics and recognises
                  their complex challenge to all stakeholders,
                  including national and international policy makers,
                  IT, business management and data science
                  communities.  The paper proposes a big data maturity
                  model for SMEs as a first step towards an SME
                  roadmap to data analytics. It considers the
                  `state-of-the-art' of IT with respect to usability
                  and usefulness for SMEs and discusses how SMEs can
                  overcome the barriers preventing them from adopting
                  existing solutions. The paper then considers
                  management perspectives and the role of maturity
                  models in enhancing and structuring the adoption of
                  data analytics in an organisation. The history of
                  total quality management is reviewed to inform the
                  core aspects of implanting a new paradigm. The paper
                  concludes with recommendations to help SMEs develop
                  their big data capability and enable them to
                  continue as the engines of European industrial and
                  business success.}
}

% Journal article on topic-aware social influence propagation models (2013).
% NOTE(review): no volume/number is recorded and pages 1--30 look like
% online-first numbering -- confirm the final issue pagination.
@article{BBM2013,
  author = {Barbieri, Nicola and Bonchi, Francesco and Manco,
                  Giuseppe},
  title = {Topic-aware social influence propagation models},
  journal = {Knowledge and Information Systems},
  year = {2013},
  pages = {1--30},
  issn = {0219-1377},
  publisher = {Springer-Verlag},
  doi = {10.1007/s10115-013-0646-6},
  url = {https://www.dropbox.com/s/hvcb12oqc3dr8j6/influence-2012.pdf?dl=0},
  keywords = {Social influence; Topic modeling; Topic-aware
                  propagation model; Viral marketing},
  abstract = {The study of influence-driven propagations in social
                  networks and its exploitation for viral marketing
                  purposes has recently received a large deal of
                  attention. However, regardless of the fact that
                  users authoritativeness, expertise, trust and
                  influence are evidently topic-dependent, the
                  research on social influence has surprisingly
                  largely overlooked this aspect. In this article, we
                  study social influence from a topic modeling
                  perspective. We introduce novel topic-aware
                  influence-driven propagation models that, as we show
                  in our experiments, are more accurate in describing
                  real-world cascades than the standard (i.e.,
                  topic-blind) propagation models studied in the
                  literature. In particular, we first propose simple
                  topic-aware extensions of the well-known Independent
                  Cascade and Linear Threshold models. However, these
                  propagation models have a very large number of
                  parameters which could lead to
                  overfitting. Therefore, we propose a different
                  approach explicitly modeling authoritativeness,
                  influence and relevance under a topic-aware
                  perspective. Instead of considering user-to-user
                  influence, the proposed model focuses on user
                  authoritativeness and interests in a topic, leading
                  to a drastic reduction in the number of parameters
                  of the model. We devise methods to learn the
                  parameters of the models from a data set of past
                  propagations. Our experimentation confirms the high
                  accuracy of the proposed models and learning
                  schemes.}
}

% Journal article on probabilistic topic models for sequence data (2013).
% The doi field holds the bare DOI; styles add the resolver prefix themselves.
@article{BarbieriMRCB13,
  author = {Nicola Barbieri and Giuseppe Manco and Ettore
                  Ritacco and Marco Carnuccio and Antonio Bevacqua},
  title = {Probabilistic topic models for sequence data},
  journal = {Machine Learning},
  volume = {93},
  number = {1},
  year = {2013},
  pages = {5--29},
  doi = {10.1007/s10994-013-5391-2},
  url = {https://www.dropbox.com/s/hvf9rnorqbntqge/ECMLPKDD2013MLJ.pdf?dl=0},
  abstract = {Probabilistic topic models are widely used in
                  different contexts to uncover the hidden structure
                  in large text corpora. One of the main (and perhaps
                  strong) assumption of these models is that
                  generative process follows a bag-of-words
                  assumption, i.e. each token is independent from the
                  previous one. We extend the popular Latent Dirichlet
                  Allocation model by exploiting three different
                  conditional Markovian assumptions: (i) the token
                  generation depends on the current topic and on the
                  previous token; (ii) the topic associated with each
                  observation depends on topic associated with the
                  previous one; (iii) the token generation depends on
                  the current and previous topic. For each of these
                  modeling assumptions we present a Gibbs Sampling
                  procedure for parameter estimation. Experimental
                  evaluation over real-world data shows the performance
                  advantages, in terms of recall and precision, of the
                  sequence-modeling approaches.}
}

% Journal article on hierarchical, structure-based clustering of XML documents (2013).
% The ampersand in the journal name is escaped so the generated bibliography
% compiles under LaTeX.
% NOTE(review): the exported placeholder issue number 0 was dropped -- add the
% real issue number if this volume has one.
@article{Costa201326,
  author = {Gianni Costa and Giuseppe Manco and Riccardo Ortale
                  and Ettore Ritacco},
  title = {Hierarchical clustering of {XML} documents focused on
                  structural components},
  journal = {Data \& Knowledge Engineering},
  volume = {84},
  pages = {26--46},
  year = {2013},
  doi = {10.1016/j.datak.2012.12.002},
  url = {https://www.dropbox.com/s/e7mxy6zrezque94/XML_Clustering_DKE.pdf?dl=0},
  abstract = {Clustering XML documents by structure is the task of
                  grouping them by common structural
                  components. Hitherto, this has been accomplished by
                  looking at the occurrence of one preestablished type
                  of structural components in the structures of the
                  XML documents. However, the a-priori chosen
                  structural components may not be the most
                  appropriate for effective clustering. Moreover, it
                  is likely that the resulting clusters exhibit a
                  certain extent of inner structural inhomogeneity,
                  because of uncaught differences in the structures of
                  the XML documents, due to further neglected forms of
                  structural components.  To overcome these
                  limitations, a new hierarchical approach is
                  proposed, that allows to consider (if necessary)
                  multiple forms of structural components to isolate
                  structurally-homogeneous clusters of XML
                  documents. At each level of the resulting hierarchy,
                  clusters are divided by considering some type of
                  structural components (unaddressed at the preceding
                  levels), that still differentiate the structures of
                  the XML documents. Each cluster in the hierarchy is
                  summarized through a novel technique, that provides
                  a clear and differentiated understanding of its
                  structural properties.  A comparative evaluation
                  over both real and synthetic XML data proves that
                  the devised approach outperforms established
                  competitors in effectiveness and
                  scalability. Cluster summarization is also shown to
                  be very representative.}
}

% Journal article on associative rule learning for classification in imprecise
% environments.
% NOTE(review): confirm that year 2011 matches the print year of volume 33.
@article{DBLP:journals/kais/CostaMOR11,
  author = {Gianni Costa and Giuseppe Manco and Riccardo Ortale
                  and Ettore Ritacco},
  title = {From global to local and viceversa: uses of
                  associative rule learning for classification in
                  imprecise environments},
  journal = {Knowledge and Information Systems},
  volume = {33},
  number = {1},
  year = {2011},
  pages = {137--169},
  url = {https://www.dropbox.com/s/su4jqahygicwdwb/kais2011_final.pdf?dl=0},
  abstract = {We propose two models for improving the performance
                  of rule-based classification under unbalanced and
                  highly imprecise domains. Both models are
                  probabilistic frameworks aimed to boost the
                  performance of basic rule-based classifiers. The
                  first model implements a global-to-local scheme,
                  where the response of a global rule-based classifier
                  is refined by performing a probabilistic analysis of
                  the coverage of its rules. In particular, the
                  coverage of the individual rules is used to learn
                  local probabilistic models, which ultimately refine
                  the predictions from the corresponding rules of the
                  global classifier. The second model implements a
                  dual local-to-global strategy, in which single
                  classification rules are combined within an
                  exponential probabilistic model in order to boost
                  the overall performance as a side effect of mutual
                  influence. Several variants of the basic ideas are
                  studied, and their performances are thoroughly
                  evaluated and compared with state-of-the-art
                  algorithms on standard benchmark datasets.}
}

% Journal article on incremental clustering for data de-duplication (2010).
% The doi field carries the identifier embedded in the resolver url.
@article{DBLP:journals/datamine/CostaMO10,
  author = {Gianni Costa and Giuseppe Manco and Riccardo Ortale},
  title = {An incremental clustering scheme for data
                  de-duplication},
  journal = {Data Mining and Knowledge Discovery},
  volume = {20},
  number = {1},
  year = {2010},
  pages = {152--187},
  doi = {10.1007/s10618-009-0155-0},
  url = {http://dx.doi.org/10.1007/s10618-009-0155-0},
  abstract = {We propose an incremental technique for discovering
                  duplicates in large databases of textual sequences,
                  i.e., syntactically different tuples, that refer to
                  the same real-world entity. The problem is
                  approached from a clustering perspective: given a
                  set of tuples, the objective is to partition them
                  into groups of duplicate tuples. Each newly arrived
                  tuple is assigned to an appropriate cluster via
                  nearest-neighbor classification. This is achieved
                  by means of a suitable hash-based index, that maps
                  any tuple to a set of indexing keys and assigns
                  tuples with high syntactic similarity to the same
                  buckets. Hence, the neighbors of a query tuple can
                  be efficiently identified by simply retrieving those
                  tuples that appear in the same buckets associated to
                  the query tuple itself, without completely scanning
                  the original database. Two alternative schemes for
                  computing indexing keys are discussed and
                  compared. An extensive experimental evaluation on
                  both synthetic and real data shows the effectiveness
                  of our approach.}
}

% Journal article on text segmentation via progressive classification (2008).
% The doi field carries the identifier embedded in the resolver url.
@article{DBLP:journals/kais/CesarioFLMO08,
  author = {Eugenio Cesario and Francesco Folino and Antonio
                  Locane and Giuseppe Manco and Riccardo Ortale},
  title = {Boosting text segmentation via progressive
                  classification},
  journal = {Knowledge and Information Systems},
  volume = {15},
  number = {3},
  year = {2008},
  pages = {285--320},
  doi = {10.1007/s10115-007-0085-3},
  url = {http://dx.doi.org/10.1007/s10115-007-0085-3},
  abstract = {A novel approach for reconciling tuples stored as
                  free text into an existing attribute schema is
                  proposed. The basic idea is to subject the available
                  text to progressive classification, i.e., a
                  multi-stage classification scheme where, at each
                  intermediate stage, a classifier is learnt that
                  analyzes the textual fragments not reconciled at the
                  end of the previous steps. Classification is
                  accomplished by an ad hoc exploitation of
                  traditional association mining algorithms, and is
                  supported by a data transformation scheme which
                  takes advantage of domain-specific
                  dictionaries/ontologies. A key feature is the
                  capability of progressively enriching the available
                  ontology with the results of the previous
                  stages of classification, thus significantly
                  improving the overall classification accuracy. An
                  extensive experimental evaluation shows the
                  effectiveness of our approach.}
}

% Journal article on mining email categories via clustering and pattern
% discovery (2008).
% The doi field carries the identifier embedded in the resolver url.
@article{DBLP:journals/jiis/MancoMT08,
  author = {Giuseppe Manco and Elio Masciari and Andrea
                  Tagarelli},
  title = {Mining categories for emails via clustering and
                  pattern discovery},
  journal = {Journal of Intelligent Information Systems},
  volume = {30},
  number = {2},
  year = {2008},
  pages = {153--181},
  doi = {10.1007/s10844-006-0024-x},
  url = {http://dx.doi.org/10.1007/s10844-006-0024-x},
  abstract = {The continuous exchange of information by means of
                  the popular email service has raised the problem of
                  managing the huge amounts of messages received from
                  users in an effective and efficient way. We deal
                  with the problem of email classification by
                  conceiving suitable strategies for: (1) organizing
                  messages into homogeneous groups, (2) redirecting
                  further incoming messages according to an initial
                  organization, and (3) building reliable descriptions
                  of the message groups discovered. We propose a
                  unified framework for handling and classifying email
                  messages. In our framework, messages sharing similar
                  features are clustered in a folder
                  organization. Clustering and pattern discovery
                  techniques for mining structured and unstructured
                  information from email messages are the basis of an
                  overall process of folder creation/maintenance and
                  email redirection. Pattern discovery is also
                  exploited for generating suitable cluster
                  descriptions that play a leading role in cluster
                  updating. Experimental evaluation performed on
                  several personal mailboxes shows the effectiveness
                  of our approach.}
}

% Journal article on structural similarity for Web information extraction (2007).
% The doi field carries the identifier embedded in the resolver url.
@article{DBLP:journals/dke/FlescaMMPP07,
  author = {Sergio Flesca and Giuseppe Manco and Elio Masciari
                  and Luigi Pontieri and Andrea Pugliese},
  title = {Exploiting structural similarity for effective {Web}
                  information extraction},
  journal = {Data \& Knowledge Engineering},
  volume = {60},
  number = {1},
  year = {2007},
  pages = {222--234},
  doi = {10.1016/j.datak.2006.01.001},
  url = {http://dx.doi.org/10.1016/j.datak.2006.01.001},
  abstract = {In this paper, we propose a classification technique
                  for Web pages, based on the detection of structural
                  similarities among semistructured documents, and
                  devise an architecture exploiting such technique for
                  the purpose of information extraction. The proposal
                  significantly differs from standard methods based on
                  graph-matching algorithms, and is based on the idea
                  of representing the structure of a document as a
                  time series in which each occurrence of a tag
                  corresponds to an impulse. The degree of similarity
                  between documents is then stated by analyzing the
                  frequencies of the corresponding Fourier
                  transform. Experiments on real data show the
                  effectiveness of the proposed technique.}
}

% Journal article on top-down, parameter-free clustering of high-dimensional
% categorical data (2007).
% The doi field carries the identifier embedded in the resolver url.
@article{DBLP:journals/tkde/CesarioMO07,
  author = {Eugenio Cesario and Giuseppe Manco and Riccardo
                  Ortale},
  title = {Top-Down Parameter-Free Clustering of
                  High-Dimensional Categorical Data},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  volume = {19},
  number = {12},
  year = {2007},
  pages = {1607--1624},
  doi = {10.1109/TKDE.2007.190649},
  url = {http://dx.doi.org/10.1109/TKDE.2007.190649},
  abstract = {A parameter-free, fully-automatic approach to
                  clustering high-dimensional categorical data is
                  proposed. The technique is based on a two-phase
                  iterative procedure, which attempts to improve the
                  overall quality of the whole partition. In the first
                  phase, cluster assignments are given, and a new
                  cluster is added to the partition by identifying and
                  splitting a low-quality cluster. In the second
                  phase, the number of clusters is fixed, and an
                  attempt to optimize cluster assignments is done. On
                  the basis of such features, the algorithm attempts
                  to improve the overall quality of the whole
                  partition and finds clusters in the data, whose
                  number is naturally established on the basis of the
                  inherent features of the underlying data set rather
                  than being previously specified. Furthermore, the
                  approach is parametric to the notion of cluster
                  quality: Here, a cluster is defined as a set of
                  tuples exhibiting a sort of homogeneity. We show how
                  a suitable notion of cluster homogeneity can be
                  defined in the context of high-dimensional
                  categorical data, from which an effective instance
                  of the proposed clustering scheme immediately
                  follows. Experiments on both synthetic and real data
                  prove that the devised algorithm scales linearly and
                  achieves nearly optimal results in terms of
                  compactness and separation.}
}

% Journal article on mining unconnected patterns in workflows (2007).
% The doi field carries the identifier embedded in the resolver url.
@article{DBLP:journals/is/GrecoGMS07,
  author = {Gianluigi Greco and Antonella Guzzo and Giuseppe
                  Manco and Domenico Sacc{\`a}},
  title = {Mining unconnected patterns in workflows},
  journal = {Information Systems},
  volume = {32},
  number = {5},
  year = {2007},
  pages = {685--712},
  doi = {10.1016/j.is.2006.05.001},
  url = {http://dx.doi.org/10.1016/j.is.2006.05.001},
  abstract = {General patterns of execution that have been
                  frequently scheduled by a workflow management system
                  provide the administrator with previously unknown,
                  and potentially useful information, e.g., about the
                  existence of unexpected causalities between
                  subprocesses of a given workflow. This paper
                  investigates the problem of mining unconnected
                  patterns on the basis of some execution traces,
                  i.e., of detecting sets of activities exhibiting no
                  explicit dependency relationships that are
                  frequently executed together. The problem is faced
                  in the paper by proposing and analyzing two
                  algorithms. One algorithm takes into account
                  information about the structure of the control-flow
                  graph only, while the other is a smart refinement
                  where the knowledge of the frequencies of edges and
                  activities in the traces at hand is also accounted
                  for, by means of a sophisticated graphical
                  analysis. Both algorithms have been implemented and
                  integrated into a system prototype, which may
                  profitably support the enactment phase of the
                  workflow. The correctness of the two algorithms is
                  formally proven, and several experiments are
                  reported to evidence the ability of the graphical
                  analysis to significantly improve the performances,
                  by dramatically pruning the search space of
                  candidate patterns.}
}

@article{DBLP:journals/tkde/FlescaMMPP05,
  author = {Sergio Flesca and Giuseppe Manco and Elio Masciari
                  and Luigi Pontieri and Andrea Pugliese},
  title = {Fast Detection of {XML} Structural Similarity},
  journal = {IEEE Trans. Knowl. Data Eng.},
  volume = {17},
  number = {2},
  year = {2005},
  pages = {160--175},
  doi = {10.1109/TKDE.2005.27},
  url = {http://doi.ieeecomputersociety.org/10.1109/TKDE.2005.27},
  abstract = {Because of the widespread diffusion of
                  semistructured data in XML format, much research
                  effort is currently devoted to support the storage
                  and retrieval of large collections of such
                  documents. XML documents can be compared as to their
                  structural similarity, in order to group them into
                  clusters so that different storage, retrieval, and
                  processing techniques can be effectively
                  exploited. In this scenario, an efficient and
                  effective similarity function is the key of a
                  successful data management process. We present an
                  approach for detecting structural similarity between
                  XML documents which significantly differs from
                  standard methods based on graph-matching algorithms,
                  and allows a significant reduction of the required
                  computation costs. Our proposal roughly consists of
                  linearizing the structure of each XML document, by
                  representing it as a numerical sequence and, then,
                  comparing such sequences through the analysis of
                  their frequencies. First, some basic strategies for
                  encoding a document are proposed, which can focus on
                  diverse structural facets. Moreover, the theory of
                  Discrete Fourier Transform is exploited to
                  effectively and efficiently compare the encoded
                  documents (i.e., signals) in the domain of
                  frequencies. Experimental results reveal the
                  effectiveness of the approach, also in comparison
                  with standard methods.}
}

@article{DBLP:journals/tkde/GrecoGMS05,
  author = {Gianluigi Greco and Antonella Guzzo and Giuseppe
                  Manco and Domenico Sacc{\`a}},
  title = {Mining and Reasoning on Workflows},
  journal = {IEEE Trans. Knowl. Data Eng.},
  volume = {17},
  number = {4},
  year = {2005},
  pages = {519--534},
  doi = {10.1109/TKDE.2005.63},
  url = {http://doi.ieeecomputersociety.org/10.1109/TKDE.2005.63},
  abstract = {Today's workflow management systems represent a key
                  technological infrastructure for advanced
                  applications that is attracting a growing body of
                  research, mainly focused in developing tools for
                  workflow management, that allow users both to
                  specify the ``static'' aspects, like preconditions,
                  precedences among activities, and rules for
                  exception handling, and to control its execution by
                  scheduling the activities on the available
                  resources. This paper deals with an aspect of
                  workflows which has so far not received much
                  attention even though it is crucial for the
                  forthcoming scenarios of large scale applications on
                  the Web: Providing facilities for the human system
                  administrator for identifying the choices performed
                  more frequently in the past that had lead to a
                  desired final configuration. In this context, we
                  formalize the problem of discovering the most
                  frequent patterns of executions, i.e., the workflow
                  substructures that have been scheduled more
                  frequently by the system. We attacked the problem by
                  developing two data mining algorithms on the basis
                  of an intuitive and original graph formalization of
                  a workflow schema and its occurrences. The model is
                  used both to prove some intractability results that
                  strongly motivate the use of data mining techniques
                  and to derive interesting structural properties for
                  reducing the search space for frequent
                  patterns. Indeed, the experiments we have carried
                  out show that our algorithms outperform standard
                  data mining algorithms adapted to discover frequent
                  patterns of workflow executions.}
}

This file was generated by bibtex2html 1.96.